Commit 1ac4fd7a (unverified), authored by André

Move regular expressions to separate file

parent c46ee9a2
......@@ -80,6 +80,8 @@
from gajim.common.const import URIAction
from gajim.common.const import GIO_TLS_ERRORS
from gajim.common.const import SHOW_LIST
from gajim.common.regex import INVALID_XML_CHARS_REGEX
from gajim.common.regex import STH_AT_STH_DOT_STH_REGEX
from gajim.common.structs import URI
......@@ -632,7 +634,7 @@ def get_auth_sha(sid, initiator, target):
def remove_invalid_xml_chars(string_):
    """Return *string_* with every match of INVALID_XML_CHARS_REGEX removed.

    Falsy input (``None`` or an empty string) is returned unchanged.
    """
    if string_:
        # Single pass with the precompiled pattern imported from
        # gajim.common.regex; no lookup on app.interface is needed.
        string_ = INVALID_XML_CHARS_REGEX.sub('', string_)
    return string_
def get_random_string(count=16):
......@@ -1067,7 +1069,7 @@ def parse_uri(uri):
uri = uri[4:]
return URI(type=URIType.TEL, data=uri)
if app.interface.sth_at_sth_dot_sth_re.match(uri):
if STH_AT_STH_DOT_STH_REGEX.match(uri):
return URI(type=URIType.AT, data=uri)
if uri.startswith('geo:'):
......
import re
def _get_link_pattern():
    """Build the (uncompiled) regular expression source that detects links.

    The result is an alternation of, in this order: URIs with an explicit
    scheme, ``mailto:`` URIs / mail-like addresses, and hosts that start
    with a legacy ``www.`` or ``ftp.`` prefix.
    """
    # Notation reminder for the patterns below:
    #   . ^ $ * + ? { } [ ] \ | ( ) are metacharacters, escaped with \
    #   \S  anything but ' ' '\t' '\n' '\r' '\f' and '\v'
    #   \s  any whitespace character
    #   \w  any alphanumeric character, \W any non-alphanumeric character
    #   \b  word boundary: zero-width, matches only at the beginning or
    #       end of a word
    #   *  0 or more times, + 1 or more times, ? 0 or 1 time, | means or
    #   (?<=\() and (?=\)) are zero-width assertions requiring an opening
    #       parenthesis right before / a closing one right after, so a
    #       link written as (http://be) is matched as http://be
    #   inside [] most metacharacters do not need escaping

    # NOTE: it's ok to catch www.gr, such stuff exists!
    legacy_hosts = (
        r"((?<=\()(www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$"
        r"&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))"
        r"|((www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]"
        r"|%[A-Fa-f0-9]{2})+"
        r"\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+)"
    )

    # FIXME: recognize xmpp: and treat it specially
    scheme_uris = (
        r"((?<=\()[A-Za-z][A-Za-z0-9\+\.\-]*:"
        r"([\w\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+"
        r"(?=\)))|(\w[\w\+\.\-]*:([^<>\s]|%[A-Fa-f0-9]{2})+)"
    )

    # 2nd alternative: at_least_one_char@at_least_one_char.at_least_one_char
    # The trailing \S*[^\s\W] forces the match to end on a word character,
    # so punctuation such as ? or ) at the very end is left out.
    mail_addresses = (
        r'\bmailto:\S*[^\s\W]|'
        r'\b\S+@\S+\.\S*[^\s\W]'
    )

    return '|'.join((scheme_uris, mail_addresses, legacy_hosts))
def _get_basic_pattern():
    """Return the link pattern extended with the ASCII formatting spans.

    Three alternatives are appended to the output of
    ``_get_link_pattern()``: text wrapped in ``*``, in ``~`` and in ``_``.
    """
    # detects eg. *b* *bold* *bold bold* test *bold* *bold*! (*bold*)
    # doesn't detect (it's a feature :P) * bold* *bold * * bold * test*bold*
    star_span = r'(?<!\w)' r'\*[^\s*]' r'([^*]*[^\s*])?' r'\*(?!\w)'
    # (?<!\S) / (?!\S): the ~ markers must be preceded / followed by
    # whitespace or the beginning / end of the text
    tilde_span = r'(?<!\S)' r'~[^\s~]' r'([^~]*[^\s~])?' r'~(?!\S)'
    underscore_span = r'(?<!\w)' r'_[^\s_]' r'([^_]*[^\s_])?' r'_(?!\w)'

    alternatives = (
        _get_link_pattern(),
        star_span,
        tilde_span,
        underscore_span,
    )
    return '|'.join(alternatives)
def _get_emot_and_basic_pattern(use_ascii_formatting=True):
    """Return a pattern source that also matches emoticons.

    :param use_ascii_formatting: when true the emoticon alternatives are
        appended to ``_get_basic_pattern()`` (links + ASCII formatting),
        otherwise to ``_get_link_pattern()`` (links only).
    """
    # Imported inside the function rather than at module level
    from gajim.gui.emoji_data import emoji_data

    emoticon_alternatives = emoji_data.get_regex()
    if use_ascii_formatting:
        build_base = _get_basic_pattern
    else:
        build_base = _get_link_pattern

    # because emoticons match later (in the string) they need to be after
    # basic matches that may occur earlier
    return '%s|%s' % (build_base(), emoticon_alternatives)
# link pattern only
LINK_REGEX = re.compile(_get_link_pattern(), re.IGNORECASE | re.UNICODE)

# link pattern + ASCII formatting
BASIC_REGEX = re.compile(_get_basic_pattern(), flags=re.IGNORECASE)

# emoticons + link pattern
EMOT_AND_LINK_REGEX = re.compile(
    _get_emot_and_basic_pattern(False),
    flags=re.IGNORECASE)

# emoticons + link pattern + ASCII formatting
EMOT_AND_BASIC_REGEX = re.compile(
    _get_emot_and_basic_pattern(True),
    flags=re.IGNORECASE)
# Matches the characters treated as invalid in XML: the control characters
# below U+0020 except TAB (\x09), LF (\x0a) and CR (\x0d), the surrogate
# code points U+D800..U+DFFF, and U+FFFE / U+FFFF.
# Written as a raw string so the escapes are interpreted by the regex
# engine (no literal control characters inside the pattern), and as one
# character class instead of five alternated single-character classes.
INVALID_XML_CHARS_REGEX = re.compile(
    r'[\x00-\x08\x0b\x0c\x0e-\x1f\ud800-\udfff\ufffe\uffff]')

# at least one character in 3 parts (before @, after @, after .)
STH_AT_STH_DOT_STH_REGEX = re.compile(
    r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')
......@@ -44,6 +44,11 @@
from gajim.common.const import Trust
from gajim.common.const import URI_SCHEMES
from gajim.common.helpers import to_user_string
from gajim.common.regex import STH_AT_STH_DOT_STH_REGEX
from gajim.common.regex import BASIC_REGEX
from gajim.common.regex import LINK_REGEX
from gajim.common.regex import EMOT_AND_BASIC_REGEX
from gajim.common.regex import EMOT_AND_LINK_REGEX
from gajim.gui import util
from gajim.gui.util import get_cursor
......@@ -584,9 +589,17 @@ def detect_and_print_special_text(self, otext, other_tags, graphics=True,
# basic: links + mail + formatting is always checked (we like that)
if app.settings.get('emoticons_theme') and graphics:
# search for emoticons & urls
iterator = app.interface.emot_and_basic_re.finditer(otext)
else: # search for just urls + mail + formatting
iterator = app.interface.basic_pattern_re.finditer(otext)
if app.settings.get('ascii_formatting'):
regex = EMOT_AND_BASIC_REGEX
else:
regex = EMOT_AND_LINK_REGEX
else:
if app.settings.get('ascii_formatting'):
# search for just urls + mail + formatting
regex = BASIC_REGEX
else: # search for just urls + mail
regex = LINK_REGEX
iterator = regex.finditer(otext)
if iter_:
end_iter = iter_
else:
......@@ -693,7 +706,7 @@ def print_special_text(self, special_text, other_tags, graphics=True,
tags.append('mail')
elif special_text.startswith('xmpp:') and not is_xhtml_link:
tags.append('xmpp')
elif app.interface.sth_at_sth_dot_sth_re.match(special_text) and\
elif STH_AT_STH_DOT_STH_REGEX.match(special_text) and \
not is_xhtml_link:
# it's a JID or mail
tags.append('sth_at_sth')
......
......@@ -28,6 +28,7 @@
from gajim.common import app
from gajim.common.i18n import _
from gajim.common.const import StyleAttr
from gajim.common.regex import LINK_REGEX
from .util import scroll_to_end
......@@ -207,7 +208,7 @@ def make_clickable_urls(self, text):
index = 0
new_text = ''
iterator = app.interface.link_pattern_re.finditer(text)
iterator = LINK_REGEX.finditer(text)
for match in iterator:
start, end = match.span()
url = text[start:end]
......
......@@ -33,7 +33,6 @@
import os
import sys
import re
import time
import json
import logging
......@@ -101,7 +100,6 @@
from gajim.gui.dialogs import InputDialog
from gajim.gui.dialogs import PassphraseDialog
from gajim.gui.filechoosers import FileChooserDialog
from gajim.gui.emoji_data import emoji_data
from gajim.gui.filetransfer import FileTransfersWindow
from gajim.gui.filetransfer_progress import FileTransferProgress
from gajim.gui.roster_item_exchange import RosterItemExchangeWindow
......@@ -1296,112 +1294,6 @@ def handle_event(self, account, fjid, type_):
if isinstance(ctrl, ChatControlBase):
ctrl.scroll_to_end()
################################################################################
### Methods dealing with emoticons
################################################################################
@property
def basic_pattern_re(self):
    """Compiled, case-insensitive form of ``self.basic_pattern``.

    Compiled on first access and cached in ``self._basic_pattern_re``.
    """
    compiled = self._basic_pattern_re
    if compiled:
        return compiled
    compiled = re.compile(self.basic_pattern, re.IGNORECASE)
    self._basic_pattern_re = compiled
    return compiled
@property
def emot_and_basic_re(self):
    """Compiled, case-insensitive form of ``self.emot_and_basic``.

    Compiled on first access and cached in ``self._emot_and_basic_re``.
    """
    compiled = self._emot_and_basic_re
    if compiled:
        return compiled
    compiled = re.compile(self.emot_and_basic, re.IGNORECASE)
    self._emot_and_basic_re = compiled
    return compiled
@property
def sth_at_sth_dot_sth_re(self):
    """Compiled form of ``self.sth_at_sth_dot_sth``.

    Compiled on first access and cached in ``self._sth_at_sth_dot_sth_re``.
    """
    compiled = self._sth_at_sth_dot_sth_re
    if compiled:
        return compiled
    compiled = re.compile(self.sth_at_sth_dot_sth)
    self._sth_at_sth_dot_sth_re = compiled
    return compiled
@property
def invalid_XML_chars_re(self):
    """Compiled form of ``self.invalid_XML_chars``.

    Compiled on first access and cached in ``self._invalid_XML_chars_re``.
    """
    compiled = self._invalid_XML_chars_re
    if compiled:
        return compiled
    compiled = re.compile(self.invalid_XML_chars)
    self._invalid_XML_chars_re = compiled
    return compiled
def make_regexps(self):
    """Build the pattern sources for link, formatting and emoticon detection.

    Resets the four lazily compiled caches (``_basic_pattern_re``,
    ``_emot_and_basic_re``, ``_sth_at_sth_dot_sth_re``,
    ``_invalid_XML_chars_re``), compiles ``link_pattern_re`` right away and
    stores the remaining patterns as strings in ``basic_pattern``,
    ``emot_and_basic``, ``sth_at_sth_dot_sth`` and ``invalid_XML_chars``.
    """
    # Notation reminder for the patterns below:
    #   . ^ $ * + ? { } [ ] \ | ( ) are metacharacters, escaped with \
    #   \S  anything but ' ' '\t' '\n' '\r' '\f' and '\v'
    #   \s  any whitespace character
    #   \w  any alphanumeric character, \W any non-alphanumeric character
    #   \b  word boundary: zero-width, matches only at the beginning or
    #       end of a word
    #   *  0 or more times, + 1 or more times, ? 0 or 1 time, | means or
    #   [^*] anything but '*', [^\s*] anything but whitespace and '*'
    #   (?<!\S) is a one char lookbehind that asks for leading whitespace
    #       or the beginning of the text, so formatting is detected even
    #       if the text is just '*foo*'; (?!\S) is the lookahead twin
    #   \S*[^\s\W] forces a match to end on a word character, so trailing
    #       punctuation such as ? or ) is left out

    # Drop previously compiled objects so the properties recompile lazily
    self._basic_pattern_re = None
    self._emot_and_basic_re = None
    self._sth_at_sth_dot_sth_re = None
    self._invalid_XML_chars_re = None

    # NOTE: it's ok to catch www.gr, such stuff exists!
    legacy_hosts = (
        r"((?<=\()(www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$"
        r"&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))"
        r"|((www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]"
        r"|%[A-Fa-f0-9]{2})+"
        r"\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+)"
    )

    # FIXME: recognize xmpp: and treat it specially
    scheme_uris = (
        r"((?<=\()[A-Za-z][A-Za-z0-9\+\.\-]*:"
        r"([\w\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+"
        r"(?=\)))|(\w[\w\+\.\-]*:([^<>\s]|%[A-Fa-f0-9]{2})+)"
    )

    # 2nd alternative: at_least_one_char@at_least_one_char.at_least_one_char
    mail_addresses = (
        r'\bmailto:\S*[^\s\W]|'
        r'\b\S+@\S+\.\S*[^\s\W]'
    )

    # detects eg. *b* *bold* *bold bold* test *bold* *bold*! (*bold*)
    # doesn't detect (it's a feature :P) * bold* *bold * * bold * test*bold*
    # The leading '|' lets this be appended directly to the link pattern.
    ascii_markup = (
        r'|(?<!\w)' r'\*[^\s*]' r'([^*]*[^\s*])?' r'\*(?!\w)|'
        r'(?<!\S)' r'~[^\s~]' r'([^~]*[^\s~])?' r'~(?!\S)|'
        r'(?<!\w)' r'_[^\s_]' r'([^_]*[^\s_])?' r'_(?!\w)'
    )

    link_source = '|'.join((scheme_uris, mail_addresses, legacy_hosts))
    self.link_pattern_re = re.compile(link_source, re.I | re.U)

    # Formatting alternatives are only included when the setting is on
    if app.settings.get('ascii_formatting'):
        basic_source = link_source + ascii_markup
    else:
        basic_source = link_source
    self.basic_pattern = basic_source

    # because emoticons match later (in the string) they need to be after
    # basic matches that may occur earlier
    emoticon_alternatives = emoji_data.get_regex()
    self.emot_and_basic = '%s|%s' % (basic_source, emoticon_alternatives)

    # at least one character in 3 parts (before @, after @, after .)
    self.sth_at_sth_dot_sth = (
        r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')

    # Invalid XML chars
    self.invalid_XML_chars = (
        '[\x00-\x08]|[\x0b-\x0c]|[\x0e-\x1f]|'
        '[\ud800-\udfff]|[\ufffe-\uffff]'
    )
################################################################################
### Methods for opening new messages controls
################################################################################
......@@ -2114,15 +2006,6 @@ def __init__(self):
self.handlers = {}
self.roster = None
self._invalid_XML_chars_re = None
self._basic_pattern_re = None
self._emot_and_basic_re = None
self._sth_at_sth_dot_sth_re = None
self.link_pattern_re = None
self.invalid_XML_chars = None
self.basic_pattern = None
self.emot_and_basic = None
self.sth_at_sth_dot_sth = None
self.avatar_storage = AvatarStorage()
......@@ -2199,8 +2082,6 @@ def __init__(self):
from gajim.gui.emoji_chooser import emoji_chooser
emoji_chooser.load()
self.make_regexps()
self.last_ftwindow_update = 0
self._network_monitor = Gio.NetworkMonitor.get_default()
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment