From 93fabbc18078732443a6d60e5f4b86ce80e092f1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andr=C3=A9=20Apitzsch?= <git@apitzsch.eu>
Date: Sat, 13 Feb 2021 12:44:06 +0100
Subject: [PATCH] Move regular expressions to separate file

---
 gajim/common/helpers.py        |   6 +-
 gajim/common/regex.py          |  88 ++++++++++++++++++++++++
 gajim/conversation_textview.py |  21 ++++--
 gajim/gtk/message_input.py     |   3 +-
 gajim/gui_interface.py         | 119 ---------------------------------
 5 files changed, 111 insertions(+), 126 deletions(-)
 create mode 100644 gajim/common/regex.py

diff --git a/gajim/common/helpers.py b/gajim/common/helpers.py
index 679e41cab7..9f96b119f6 100644
--- a/gajim/common/helpers.py
+++ b/gajim/common/helpers.py
@@ -81,6 +81,8 @@
 from gajim.common.const import URIAction
 from gajim.common.const import GIO_TLS_ERRORS
 from gajim.common.const import SHOW_LIST
+from gajim.common.regex import INVALID_XML_CHARS_REGEX
+from gajim.common.regex import STH_AT_STH_DOT_STH_REGEX
 from gajim.common.structs import URI
 
 
@@ -633,7 +635,7 @@ def get_auth_sha(sid, initiator, target):
 
 def remove_invalid_xml_chars(string_):
     if string_:
-        string_ = re.sub(app.interface.invalid_XML_chars_re, '', string_)
+        string_ = re.sub(INVALID_XML_CHARS_REGEX, '', string_)
     return string_
 
 def get_random_string(count=16):
@@ -1068,7 +1070,7 @@ def parse_uri(uri):
         uri = uri[4:]
         return URI(type=URIType.TEL, data=uri)
 
-    if app.interface.sth_at_sth_dot_sth_re.match(uri):
+    if STH_AT_STH_DOT_STH_REGEX.match(uri):
         return URI(type=URIType.AT, data=uri)
 
     if uri.startswith('geo:'):
diff --git a/gajim/common/regex.py b/gajim/common/regex.py
new file mode 100644
index 0000000000..4025965229
--- /dev/null
+++ b/gajim/common/regex.py
@@ -0,0 +1,88 @@
+import re
+
+def _get_link_pattern():
+    # regexp meta characters are:  . ^ $ * + ? { } [ ] \ | ( )
+    # one escapes the metachars with \
+    # \S matches anything but ' ' '\t' '\n' '\r' '\f' and '\v'
+    # \s matches any whitespace character
+    # \w any alphanumeric character or '_'
+    # \W any character that is not alphanumeric or '_'
+    # \b means word boundary. This is a zero-width assertion that
+    #    matches only at the beginning or end of a word.
+    # ^ matches at the beginning of the string (re.MULTILINE is not used)
+    #
+    # * means 0 or more times
+    # + means 1 or more times
+    # ? means 0 or 1 time
+    # | means or
+    # [^*] anything but '*' (inside [] you don't have to escape metachars)
+    # [^\s*] anything but whitespaces and '*'
+    # (?<!\S) is a one-char negative lookbehind assertion: it requires the
+    #         preceding character to be whitespace
+    # or the start of the text, so formatting is detected correctly
+    # even if the text is just '*foo*'
+    # (?!\S) is the same thing but it's a lookahead assertion
+    # \S*[^\s\W] --> in the matching string don't match ? or ) etc.. if at
+    #                the end
+    # so http://be) will match http://be and http://be)be) will match
+    # http://be)be
+
+    legacy_prefixes = r"((?<=\()(www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$"\
+        r"&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))"\
+        r"|((www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]"\
+        r"|%[A-Fa-f0-9]{2})+"\
+        r"\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+)"
+    # NOTE: it's ok to catch things like www.gr; such domains exist!
+
+    # FIXME: recognize xmpp: and treat it specially
+    links = r"((?<=\()[A-Za-z][A-Za-z0-9\+\.\-]*:"\
+        r"([\w\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+"\
+        r"(?=\)))|(\w[\w\+\.\-]*:([^<>\s]|%[A-Fa-f0-9]{2})+)"
+
+    # 2nd one: at_least_one_char@at_least_one_char.at_least_one_char
+    mail = r'\bmailto:\S*[^\s\W]|' r'\b\S+@\S+\.\S*[^\s\W]'
+
+    link_pattern = links + '|' + mail + '|' + legacy_prefixes
+    return link_pattern
+
+def _get_basic_pattern():
+    basic_pattern = _get_link_pattern()
+    # detects eg. *b* *bold* *bold bold* test *bold* *bold*! (*bold*)
+    # doesn't detect (it's a feature :P) * bold* *bold * * bold * test*bold*
+    formatting = r'|(?<!\w)' r'\*[^\s*]' r'([^*]*[^\s*])?' r'\*(?!\w)|'\
+        r'(?<!\S)' r'~[^\s~]' r'([^~]*[^\s~])?' r'~(?!\S)|'\
+        r'(?<!\w)' r'_[^\s_]' r'([^_]*[^\s_])?' r'_(?!\w)'
+    return basic_pattern + formatting
+
+def _get_emot_and_basic_pattern(use_ascii_formatting=True):
+    from gajim.gui.emoji_data import emoji_data
+    # because emoticons match later (in the string) they need to be after
+    # basic matches that may occur earlier
+    emoticons = emoji_data.get_regex()
+
+    if use_ascii_formatting:
+        pattern = _get_basic_pattern()
+    else:
+        pattern = _get_link_pattern()
+
+    return '%s|%s' % (pattern, emoticons)
+
+LINK_REGEX = re.compile(_get_link_pattern(), re.I | re.U)
+
+# link pattern + ASCII formatting
+BASIC_REGEX = re.compile(_get_basic_pattern(), re.IGNORECASE)
+
+# emoticons + link pattern
+EMOT_AND_LINK_REGEX = re.compile(_get_emot_and_basic_pattern(False),
+                                          re.IGNORECASE)
+
+# emoticons + link pattern + ASCII formatting
+EMOT_AND_BASIC_REGEX = re.compile(_get_emot_and_basic_pattern(True),
+                                          re.IGNORECASE)
+
+INVALID_XML_CHARS_REGEX = re.compile(
+    '[\x00-\x08]|[\x0b-\x0c]|[\x0e-\x1f]|[\ud800-\udfff]|[\ufffe-\uffff]')
+
+# at least one character in 3 parts (before @, after @, after .)
+STH_AT_STH_DOT_STH_REGEX = re.compile(
+    r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$')
diff --git a/gajim/conversation_textview.py b/gajim/conversation_textview.py
index a14c72c08a..55a950c080 100644
--- a/gajim/conversation_textview.py
+++ b/gajim/conversation_textview.py
@@ -44,6 +44,11 @@
 from gajim.common.const import Trust
 from gajim.common.const import URI_SCHEMES
 from gajim.common.helpers import to_user_string
+from gajim.common.regex import STH_AT_STH_DOT_STH_REGEX
+from gajim.common.regex import BASIC_REGEX
+from gajim.common.regex import LINK_REGEX
+from gajim.common.regex import EMOT_AND_BASIC_REGEX
+from gajim.common.regex import EMOT_AND_LINK_REGEX
 
 from gajim.gui import util
 from gajim.gui.util import get_cursor
@@ -584,9 +589,17 @@ def detect_and_print_special_text(self, otext, other_tags, graphics=True,
         # basic: links + mail + formatting is always checked (we like that)
         if app.settings.get('emoticons_theme') and graphics:
             # search for emoticons & urls
-            iterator = app.interface.emot_and_basic_re.finditer(otext)
-        else: # search for just urls + mail + formatting
-            iterator = app.interface.basic_pattern_re.finditer(otext)
+            if app.settings.get('ascii_formatting'):
+                regex = EMOT_AND_BASIC_REGEX
+            else:
+                regex = EMOT_AND_LINK_REGEX
+        else:
+            if app.settings.get('ascii_formatting'):
+                # search for just urls + mail + formatting
+                regex = BASIC_REGEX
+            else: # search for just urls + mail
+                regex = LINK_REGEX
+        iterator = regex.finditer(otext)
         if iter_:
             end_iter = iter_
         else:
@@ -693,7 +706,7 @@ def print_special_text(self, special_text, other_tags, graphics=True,
             tags.append('mail')
         elif special_text.startswith('xmpp:') and not is_xhtml_link:
             tags.append('xmpp')
-        elif app.interface.sth_at_sth_dot_sth_re.match(special_text) and\
+        elif STH_AT_STH_DOT_STH_REGEX.match(special_text) and \
         not is_xhtml_link:
             # it's a JID or mail
             tags.append('sth_at_sth')
diff --git a/gajim/gtk/message_input.py b/gajim/gtk/message_input.py
index bf7a99305a..1141eee3ff 100644
--- a/gajim/gtk/message_input.py
+++ b/gajim/gtk/message_input.py
@@ -28,6 +28,7 @@
 from gajim.common import app
 from gajim.common.i18n import _
 from gajim.common.const import StyleAttr
+from gajim.common.regex import LINK_REGEX
 
 from .util import scroll_to_end
 
@@ -207,7 +208,7 @@ def make_clickable_urls(self, text):
         index = 0
 
         new_text = ''
-        iterator = app.interface.link_pattern_re.finditer(text)
+        iterator = LINK_REGEX.finditer(text)
         for match in iterator:
             start, end = match.span()
             url = text[start:end]
diff --git a/gajim/gui_interface.py b/gajim/gui_interface.py
index da0bbaefa3..1fefdf6131 100644
--- a/gajim/gui_interface.py
+++ b/gajim/gui_interface.py
@@ -33,7 +33,6 @@
 
 import os
 import sys
-import re
 import time
 import json
 import logging
@@ -101,7 +100,6 @@
 from gajim.gui.dialogs import InputDialog
 from gajim.gui.dialogs import PassphraseDialog
 from gajim.gui.filechoosers import FileChooserDialog
-from gajim.gui.emoji_data import emoji_data
 from gajim.gui.filetransfer import FileTransfersWindow
 from gajim.gui.filetransfer_progress import FileTransferProgress
 from gajim.gui.roster_item_exchange import RosterItemExchangeWindow
@@ -1296,112 +1294,6 @@ def handle_event(self, account, fjid, type_):
             if isinstance(ctrl, ChatControlBase):
                 ctrl.scroll_to_end()
 
-################################################################################
-### Methods dealing with emoticons
-################################################################################
-
-    @property
-    def basic_pattern_re(self):
-        if not self._basic_pattern_re:
-            self._basic_pattern_re = re.compile(self.basic_pattern,
-                re.IGNORECASE)
-        return self._basic_pattern_re
-
-    @property
-    def emot_and_basic_re(self):
-        if not self._emot_and_basic_re:
-            self._emot_and_basic_re = re.compile(
-                self.emot_and_basic, re.IGNORECASE)
-        return self._emot_and_basic_re
-
-    @property
-    def sth_at_sth_dot_sth_re(self):
-        if not self._sth_at_sth_dot_sth_re:
-            self._sth_at_sth_dot_sth_re = re.compile(self.sth_at_sth_dot_sth)
-        return self._sth_at_sth_dot_sth_re
-
-    @property
-    def invalid_XML_chars_re(self):
-        if not self._invalid_XML_chars_re:
-            self._invalid_XML_chars_re = re.compile(self.invalid_XML_chars)
-        return self._invalid_XML_chars_re
-
-    def make_regexps(self):
-        # regexp meta characters are:  . ^ $ * + ? { } [ ] \ | ( )
-        # one escapes the metachars with \
-        # \S matches anything but ' ' '\t' '\n' '\r' '\f' and '\v'
-        # \s matches any whitespace character
-        # \w any alphanumeric character
-        # \W any non-alphanumeric character
-        # \b means word boundary. This is a zero-width assertion that
-        #    matches only at the beginning or end of a word.
-        # ^ matches at the beginning of lines
-        #
-        # * means 0 or more times
-        # + means 1 or more times
-        # ? means 0 or 1 time
-        # | means or
-        # [^*] anything but '*' (inside [] you don't have to escape metachars)
-        # [^\s*] anything but whitespaces and '*'
-        # (?<!\S) is a one char lookbehind assertion and asks for any leading
-        #         whitespace
-        # and matches beginning of lines so we have correct formatting detection
-        # even if the text is just '*foo*'
-        # (?!\S) is the same thing but it's a lookahead assertion
-        # \S*[^\s\W] --> in the matching string don't match ? or ) etc.. if at
-        #                the end
-        # so http://be) will match http://be and http://be)be) will match
-        # http://be)be
-
-        self._basic_pattern_re = None
-        self._emot_and_basic_re = None
-        self._sth_at_sth_dot_sth_re = None
-        self._invalid_XML_chars_re = None
-
-        legacy_prefixes = r"((?<=\()(www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$"\
-            r"&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+(?=\)))"\
-            r"|((www|ftp)\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]"\
-            r"|%[A-Fa-f0-9]{2})+"\
-            r"\.([A-Za-z0-9\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+)"
-        # NOTE: it's ok to catch www.gr such stuff exist!
-
-        # FIXME: recognize xmpp: and treat it specially
-        links = r"((?<=\()[A-Za-z][A-Za-z0-9\+\.\-]*:"\
-            r"([\w\.\-_~:/\?#\[\]@!\$&'\(\)\*\+,;=]|%[A-Fa-f0-9]{2})+"\
-            r"(?=\)))|(\w[\w\+\.\-]*:([^<>\s]|%[A-Fa-f0-9]{2})+)"
-
-        # 2nd one: at_least_one_char@at_least_one_char.at_least_one_char
-        mail = r'\bmailto:\S*[^\s\W]|' r'\b\S+@\S+\.\S*[^\s\W]'
-
-        # detects eg. *b* *bold* *bold bold* test *bold* *bold*! (*bold*)
-        # doesn't detect (it's a feature :P) * bold* *bold * * bold * test*bold*
-        formatting = r'|(?<!\w)' r'\*[^\s*]' r'([^*]*[^\s*])?' r'\*(?!\w)|'\
-            r'(?<!\S)' r'~[^\s~]' r'([^~]*[^\s~])?' r'~(?!\S)|'\
-            r'(?<!\w)' r'_[^\s_]' r'([^_]*[^\s_])?' r'_(?!\w)'
-
-        basic_pattern = links + '|' + mail + '|' + legacy_prefixes
-
-        link_pattern = basic_pattern
-        self.link_pattern_re = re.compile(link_pattern, re.I | re.U)
-
-        if app.settings.get('ascii_formatting'):
-            basic_pattern += formatting
-        self.basic_pattern = basic_pattern
-
-        # because emoticons match later (in the string) they need to be after
-        # basic matches that may occur earlier
-        emoticons = emoji_data.get_regex()
-
-        self.emot_and_basic = '%s|%s' % (basic_pattern, emoticons)
-
-        # at least one character in 3 parts (before @, after @, after .)
-        self.sth_at_sth_dot_sth = r'^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$'
-
-        # Invalid XML chars
-        self.invalid_XML_chars = '[\x00-\x08]|[\x0b-\x0c]|[\x0e-\x1f]|'\
-            '[\ud800-\udfff]|[\ufffe-\uffff]'
-
-
 ################################################################################
 ### Methods for opening new messages controls
 ################################################################################
@@ -2114,15 +2006,6 @@ def __init__(self):
 
         self.handlers = {}
         self.roster = None
-        self._invalid_XML_chars_re = None
-        self._basic_pattern_re = None
-        self._emot_and_basic_re = None
-        self._sth_at_sth_dot_sth_re = None
-        self.link_pattern_re = None
-        self.invalid_XML_chars = None
-        self.basic_pattern = None
-        self.emot_and_basic = None
-        self.sth_at_sth_dot_sth = None
 
         self.avatar_storage = AvatarStorage()
 
@@ -2199,8 +2082,6 @@ def __init__(self):
             from gajim.gui.emoji_chooser import emoji_chooser
             emoji_chooser.load()
 
-        self.make_regexps()
-
         self.last_ftwindow_update = 0
 
         self._network_monitor = Gio.NetworkMonitor.get_default()
-- 
GitLab