From 93fabbc18078732443a6d60e5f4b86ce80e092f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Apitzsch?= <git@apitzsch.eu> Date: Sat, 13 Feb 2021 12:44:06 +0100 Subject: [PATCH] Move regular expressions to separate file --- gajim/common/helpers.py | 6 +- gajim/common/regex.py | 88 ++++++++++++++++++++++++ gajim/conversation_textview.py | 21 ++++-- gajim/gtk/message_input.py | 3 +- gajim/gui_interface.py | 119 --------------------------------- 5 files changed, 111 insertions(+), 126 deletions(-) create mode 100644 gajim/common/regex.py diff --git a/gajim/common/helpers.py b/gajim/common/helpers.py index 679e41cab7..9f96b119f6 100644 --- a/gajim/common/helpers.py +++ b/gajim/common/helpers.py @@ -81,6 +81,8 @@ from gajim.common.const import URIAction from gajim.common.const import GIO_TLS_ERRORS from gajim.common.const import SHOW_LIST +from gajim.common.regex import INVALID_XML_CHARS_REGEX +from gajim.common.regex import STH_AT_STH_DOT_STH_REGEX from gajim.common.structs import URI @@ -633,7 +635,7 @@ def get_auth_sha(sid, initiator, target): def remove_invalid_xml_chars(string_): if string_: - string_ = re.sub(app.interface.invalid_XML_chars_re, '', string_) + string_ = re.sub(INVALID_XML_CHARS_REGEX, '', string_) return string_ def get_random_string(count=16): @@ -1068,7 +1070,7 @@ def parse_uri(uri): uri = uri[4:] return URI(type=URIType.TEL, data=uri) - if app.interface.sth_at_sth_dot_sth_re.match(uri): + if STH_AT_STH_DOT_STH_REGEX.match(uri): return URI(type=URIType.AT, data=uri) if uri.startswith('geo:'): diff --git a/gajim/common/regex.py b/gajim/common/regex.py new file mode 100644 index 0000000000..4025965229 --- /dev/null +++ b/gajim/common/regex.py @@ -0,0 +1,88 @@ +import re + +def _get_link_pattern(): + # regexp meta characters are: . ^ $ * + ? { } [ ] \ | ( ) + # one escapes the metachars with \ + # \S matches anything but ' ' '\t' '\n' '\r' '\f' and '\v' + # \s matches any whitespace character + # \w any alphanumeric character + # \W any non-alphanumeric character + # \b means word boundary. This is a zero-width assertion that + # matches only at the beginning or end of a word. + # ^ matches at the beginning of lines + # + # * means 0 or more times + # + means 1 or more times + # ? means 0 or 1 time + # | means or + # [^*] anything but '*' (inside [] you don't have to escape metachars) + # [^\s*] anything but whitespaces and '*' + # (?<!\S) is a one char lookbehind assertion and asks for any leading + # whitespace + # and matches beginning of lines so we have correct formatting detection + # even if the text is just '*foo*' + # (?!\S) is the same thing but it's a lookahead assertion + # \S*[^\s\W] --> in the matching string don't match ? or ) etc.. if at + # the end + # so http://be) will match http://be and http://be)be) will match + # http://be)be + + legacy_prefixes = r"((?<=\()(www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$"\ + r"&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))"\ + r"|((www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]"\ + r"|%[A-Fa-f0-9]{2})+"\ + r"\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+)" + # NOTE: it's ok to catch www.gr such stuff exist! + + # FIXME: recognize xmpp: and treat it specially + links = r"((?<=\()[A-Za-z][A-Za-z0-9\+\.\-]*:"\ + r"([\w\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+"\ + r"(?=\)))|(\w[\w\+\.\-]*:([^<>\s]|%[A-Fa-f0-9]{2})+)" + + # 2nd one: at_least_one_char@at_least_one_char.at_least_one_char + mail = r'\bmailto:\S*[^\s\W]|' r'\b\S+@\S+\.\S*[^\s\W]' + + link_pattern = links + '|' + mail + '|' + legacy_prefixes + return link_pattern + +def _get_basic_pattern(): + basic_pattern = _get_link_pattern() + # detects eg. *b* *bold* *bold bold* test *bold* *bold*! (*bold*) + # doesn't detect (it's a feature :P) * bold* *bold * * bold * test*bold* + formatting = r'|(?<!\w)' r'\*[^\s*]' r'([^*]*[^\s*])?' r'\*(?!\w)|'\ + r'(?<!\S)' r'~[^\s~]' r'([^~]*[^\s~])?' r'~(?!\S)|'\ + r'(?<!\w)' r'_[^\s_]' r'([^_]*[^\s_])?' r'_(?!\w)' + return basic_pattern + formatting + +def _get_emot_and_basic_pattern(use_ascii_formatting=True): + from gajim.gui.emoji_data import emoji_data + # because emoticons match later (in the string) they need to be after + # basic matches that may occur earlier + emoticons = emoji_data.get_regex() + + if use_ascii_formatting: + pattern = _get_basic_pattern() + else: + pattern = _get_link_pattern() + + return '%s|%s' % (pattern, emoticons) + +LINK_REGEX = re.compile(_get_link_pattern(), re.I | re.U) + +# link pattern + ASCII formatting +BASIC_REGEX = re.compile(_get_basic_pattern(), re.IGNORECASE) + +# emoticons + link pattern +EMOT_AND_LINK_REGEX = re.compile(_get_emot_and_basic_pattern(False), + re.IGNORECASE) + +# emoticons + link pattern + ASCII formatting +EMOT_AND_BASIC_REGEX = re.compile(_get_emot_and_basic_pattern(True), + re.IGNORECASE) + +INVALID_XML_CHARS_REGEX = re.compile( + '[\x00-\x08]|[\x0b-\x0c]|[\x0e-\x1f]|[\ud800-\udfff]|[\ufffe-\uffff]') + +# at least one character in 3 parts (before @, after @, after .) +STH_AT_STH_DOT_STH_REGEX = re.compile( + r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$') diff --git a/gajim/conversation_textview.py b/gajim/conversation_textview.py index a14c72c08a..55a950c080 100644 --- a/gajim/conversation_textview.py +++ b/gajim/conversation_textview.py @@ -44,6 +44,11 @@ from gajim.common.const import Trust from gajim.common.const import URI_SCHEMES from gajim.common.helpers import to_user_string +from gajim.common.regex import STH_AT_STH_DOT_STH_REGEX +from gajim.common.regex import BASIC_REGEX +from gajim.common.regex import LINK_REGEX +from gajim.common.regex import EMOT_AND_BASIC_REGEX +from gajim.common.regex import EMOT_AND_LINK_REGEX from gajim.gui import util from gajim.gui.util import get_cursor @@ -584,9 +589,17 @@ def detect_and_print_special_text(self, otext, other_tags, graphics=True, # basic: links + mail + formatting is always checked (we like that) if app.settings.get('emoticons_theme') and graphics: # search for emoticons & urls - iterator = app.interface.emot_and_basic_re.finditer(otext) - else: # search for just urls + mail + formatting - iterator = app.interface.basic_pattern_re.finditer(otext) + if app.settings.get('ascii_formatting'): + regex = EMOT_AND_BASIC_REGEX + else: + regex = EMOT_AND_LINK_REGEX + else: + if app.settings.get('ascii_formatting'): + # search for just urls + mail + formatting + regex = BASIC_REGEX + else: # search for just urls + mail + regex = LINK_REGEX + iterator = regex.finditer(otext) if iter_: end_iter = iter_ else: @@ -693,7 +706,7 @@ def print_special_text(self, special_text, other_tags, graphics=True, tags.append('mail') elif special_text.startswith('xmpp:') and not is_xhtml_link: tags.append('xmpp') - elif app.interface.sth_at_sth_dot_sth_re.match(special_text) and\ + elif STH_AT_STH_DOT_STH_REGEX.match(special_text) and \ not is_xhtml_link: # it's a JID or mail tags.append('sth_at_sth') diff --git a/gajim/gtk/message_input.py b/gajim/gtk/message_input.py index bf7a99305a..1141eee3ff 100644 --- a/gajim/gtk/message_input.py +++ b/gajim/gtk/message_input.py @@ -28,6 +28,7 @@ from gajim.common import app from gajim.common.i18n import _ from gajim.common.const import StyleAttr +from gajim.common.regex import LINK_REGEX from .util import scroll_to_end @@ -207,7 +208,7 @@ def make_clickable_urls(self, text): index = 0 new_text = '' - iterator = app.interface.link_pattern_re.finditer(text) + iterator = LINK_REGEX.finditer(text) for match in iterator: start, end = match.span() url = text[start:end] diff --git a/gajim/gui_interface.py b/gajim/gui_interface.py index da0bbaefa3..1fefdf6131 100644 --- a/gajim/gui_interface.py +++ b/gajim/gui_interface.py @@ -33,7 +33,6 @@ import os import sys -import re import time import json import logging @@ -101,7 +100,6 @@ from gajim.gui.dialogs import InputDialog from gajim.gui.dialogs import PassphraseDialog from gajim.gui.filechoosers import FileChooserDialog -from gajim.gui.emoji_data import emoji_data from gajim.gui.filetransfer import FileTransfersWindow from gajim.gui.filetransfer_progress import FileTransferProgress from gajim.gui.roster_item_exchange import RosterItemExchangeWindow @@ -1296,112 +1294,6 @@ def handle_event(self, account, fjid, type_): if isinstance(ctrl, ChatControlBase): ctrl.scroll_to_end() -################################################################################ -### Methods dealing with emoticons -################################################################################ - - @property - def basic_pattern_re(self): - if not self._basic_pattern_re: - self._basic_pattern_re = re.compile(self.basic_pattern, - re.IGNORECASE) - return self._basic_pattern_re - - @property - def emot_and_basic_re(self): - if not self._emot_and_basic_re: - self._emot_and_basic_re = re.compile( - self.emot_and_basic, re.IGNORECASE) - return self._emot_and_basic_re - - @property - def sth_at_sth_dot_sth_re(self): - if not self._sth_at_sth_dot_sth_re: - self._sth_at_sth_dot_sth_re = re.compile(self.sth_at_sth_dot_sth) - return self._sth_at_sth_dot_sth_re - - @property - def invalid_XML_chars_re(self): - if not self._invalid_XML_chars_re: - self._invalid_XML_chars_re = re.compile(self.invalid_XML_chars) - return self._invalid_XML_chars_re - - def make_regexps(self): - # regexp meta characters are: . ^ $ * + ? { } [ ] \ | ( ) - # one escapes the metachars with \ - # \S matches anything but ' ' '\t' '\n' '\r' '\f' and '\v' - # \s matches any whitespace character - # \w any alphanumeric character - # \W any non-alphanumeric character - # \b means word boundary. This is a zero-width assertion that - # matches only at the beginning or end of a word. - # ^ matches at the beginning of lines - # - # * means 0 or more times - # + means 1 or more times - # ? means 0 or 1 time - # | means or - # [^*] anything but '*' (inside [] you don't have to escape metachars) - # [^\s*] anything but whitespaces and '*' - # (?<!\S) is a one char lookbehind assertion and asks for any leading - # whitespace - # and matches beginning of lines so we have correct formatting detection - # even if the text is just '*foo*' - # (?!\S) is the same thing but it's a lookahead assertion - # \S*[^\s\W] --> in the matching string don't match ? or ) etc.. if at - # the end - # so http://be) will match http://be and http://be)be) will match - # http://be)be - - self._basic_pattern_re = None - self._emot_and_basic_re = None - self._sth_at_sth_dot_sth_re = None - self._invalid_XML_chars_re = None - - legacy_prefixes = r"((?<=\()(www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$"\ - r"&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))"\ - r"|((www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]"\ - r"|%[A-Fa-f0-9]{2})+"\ - r"\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+)" - # NOTE: it's ok to catch www.gr such stuff exist! - - # FIXME: recognize xmpp: and treat it specially - links = r"((?<=\()[A-Za-z][A-Za-z0-9\+\.\-]*:"\ - r"([\w\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+"\ - r"(?=\)))|(\w[\w\+\.\-]*:([^<>\s]|%[A-Fa-f0-9]{2})+)" - - # 2nd one: at_least_one_char@at_least_one_char.at_least_one_char - mail = r'\bmailto:\S*[^\s\W]|' r'\b\S+@\S+\.\S*[^\s\W]' - - # detects eg. *b* *bold* *bold bold* test *bold* *bold*! (*bold*) - # doesn't detect (it's a feature :P) * bold* *bold * * bold * test*bold* - formatting = r'|(?<!\w)' r'\*[^\s*]' r'([^*]*[^\s*])?' r'\*(?!\w)|'\ - r'(?<!\S)' r'~[^\s~]' r'([^~]*[^\s~])?' r'~(?!\S)|'\ - r'(?<!\w)' r'_[^\s_]' r'([^_]*[^\s_])?' r'_(?!\w)' - - basic_pattern = links + '|' + mail + '|' + legacy_prefixes - - link_pattern = basic_pattern - self.link_pattern_re = re.compile(link_pattern, re.I | re.U) - - if app.settings.get('ascii_formatting'): - basic_pattern += formatting - self.basic_pattern = basic_pattern - - # because emoticons match later (in the string) they need to be after - # basic matches that may occur earlier - emoticons = emoji_data.get_regex() - - self.emot_and_basic = '%s|%s' % (basic_pattern, emoticons) - - # at least one character in 3 parts (before @, after @, after .) - self.sth_at_sth_dot_sth = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$' - - # Invalid XML chars - self.invalid_XML_chars = '[\x00-\x08]|[\x0b-\x0c]|[\x0e-\x1f]|'\ - '[\ud800-\udfff]|[\ufffe-\uffff]' - - ################################################################################ ### Methods for opening new messages controls ################################################################################ @@ -2114,15 +2006,6 @@ def __init__(self): self.handlers = {} self.roster = None - self._invalid_XML_chars_re = None - self._basic_pattern_re = None - self._emot_and_basic_re = None - self._sth_at_sth_dot_sth_re = None - self.link_pattern_re = None - self.invalid_XML_chars = None - self.basic_pattern = None - self.emot_and_basic = None - self.sth_at_sth_dot_sth = None self.avatar_storage = AvatarStorage() @@ -2199,8 +2082,6 @@ def __init__(self): from gajim.gui.emoji_chooser import emoji_chooser emoji_chooser.load() - self.make_regexps() - self.last_ftwindow_update = 0 self._network_monitor = Gio.NetworkMonitor.get_default() -- GitLab