chat_syntax_highlighter.py 10.7 KB
Newer Older
1 2 3 4 5 6
import logging
import re
import pygments

from gi.repository import Gtk

7
from gajim.plugins.helpers import log
8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64


from .gtkformatter import GTKFormatter
from .types import MatchType, LineBreakOptions, CodeMarkerOptions


log = logging.getLogger('gajim.plugin_system.syntax_highlight')

class ChatSyntaxHighlighter:
    def hide_code_markup(self, buf, start, end):
        """Make the text between ``start`` and ``end`` invisible.

        The 'hide_code_markup' tag is created (with its 'invisible' property
        set) and registered in the buffer's tag table the first time it is
        needed; afterwards the existing tag is reused.

        :param buf: text buffer whose tag table is consulted and modified
        :param start: iterator at the beginning of the range to hide
        :param end: iterator at the end of the range to hide
        """
        tag_name = 'hide_code_markup'
        tag_table = buf.get_tag_table()

        # Register the tag lazily; looking it up first avoids adding it twice.
        if tag_table.lookup(tag_name) is None:
            invisible_tag = Gtk.TextTag.new(tag_name)
            invisible_tag.set_property('invisible', True)
            tag_table.add(invisible_tag)

        buf.apply_tag_by_name(tag_name, start, end)

    def check_line_break(self, is_multiline):
        """Tell whether a line break should be forced around a code snippet.

        The configured line-break action is read from ``self.config``.
        A break is requested when the action is ``ALWAYS``, or when the
        action is ``MULTILINE`` and the snippet is a multiline one.

        :param is_multiline: truthy if the snippet is a multiline code block
        :return: truthy if a line break should be inserted
        """
        action = self.config.get_line_break_action()
        if action == LineBreakOptions.ALWAYS:
            return True
        return is_multiline and action == LineBreakOptions.MULTILINE


    def format_code(self, buf, s_tag, s_code, e_tag, e_code, language):
        """Style the code markers and syntax-highlight the enclosed code.

        The ranges ``s_tag``..``s_code`` and ``e_code``..``e_tag`` are
        treated as the code markers: depending on the configured marker
        setting they are either hidden or rendered with the style's comment
        formatting. The text between ``s_code`` and ``e_code`` is then lexed
        and formatted through pygments.

        :param buf: text buffer that holds the code
        :param s_tag: iterator at the start of the opening marker
        :param s_code: iterator at the start of the code itself
        :param e_tag: iterator at the end of the closing marker
        :param e_code: iterator at the end of the code itself
        :param language: language name for lexer lookup, or None to use the
            configured default lexer
        """
        style = self.config.get_style_name()
        hide_markers = \
            self.config.get_code_marker_setting() == CodeMarkerOptions.HIDE

        if not hide_markers:
            # Visible markers are rendered like a comment of the chosen style.
            marker_tag = GTKFormatter.create_tag_for_token(
                    pygments.token.Comment,
                    pygments.styles.get_style_by_name(style))
            buf.get_tag_table().add(marker_tag)
            buf.apply_tag(marker_tag, s_tag, s_code)
            buf.apply_tag(marker_tag, e_tag, e_code)
        else:
            self.hide_code_markup(buf, s_tag, s_code)
            self.hide_code_markup(buf, e_code, e_tag)

        code = s_code.get_text(e_code)
        log.debug("full text to encode: %s.", code)

        # A mark keeps track of the code start across buffer modifications.
        start_mark = buf.create_mark(None, s_code, False)

        if language is not None:
            log.debug("Using lexer for %s.", str(language))
            lexer = self.config.get_lexer_with_fallback(language)
        else:
            lexer = self.config.get_default_lexer()
            log.info("No Language specified. Falling back to default lexer: %s.",
                    self.config.get_default_lexer_name())

        if lexer is None:
            # No lexer available at all: only a newline is inserted at the
            # code start.
            buf.insert(buf.get_iter_at_mark(start_mark), '\n')
            return

        if self.config.is_internal_none_lexer(lexer):
            # The config's internal "none" lexer: leave the code unformatted.
            return

        tokens = pygments.lex(code, lexer)
        formatter = GTKFormatter(style=style, start_mark=start_mark)
        pygments.format(tokens, formatter, buf)

    def find_multiline_matches(self, text):
        """Find fenced (```) multiline code blocks in ``text``.

        Fence lines are collected in order and paired up: the first fence
        opens a block, the next one closes it, and so on. A trailing fence
        without a partner is ignored.

        :param text: message text to scan
        :return: list of ``(start, end, matched_text)`` tuples, where
            ``matched_text == text[start:end]`` spans from the opening fence
            match to the end of the closing fence match
        """
        # A less strict variant would also allow whitespace around the fence:
        #   r'(?:^|\n)[ |\t]*(```)\S*[ |\t]*(?:\n|$)'
        fence_pattern = r'(?:^|\n)(```)\S*(?:\n|$)'

        blocks = []
        opening = None
        for fence in re.finditer(fence_pattern, text, re.DOTALL):
            if opening is None:
                opening = fence
                continue

            if re.match(r'^\n```', fence.group(0)) is None:
                # not an end...
                continue

            begin, finish = opening.start(), fence.end()
            blocks.append((begin, finish, text[begin:finish]))
            opening = None
        return blocks

    def find_inline_matches(self, text):
        """
        Locate inline code spans (`code`) in ``text``.

        Inline code is highlighted if the start marker is preceded by the
        start of the text, a whitespace character or either of the other span
        markers defined in XEP-0393 (``*``, ``~``, ``_``).
        The same applies mirrored to the end marker.

        The surrounding delimiters are checked with zero-width lookarounds,
        so they are not consumed by a match. This lets two spans share a
        single delimiter, e.g. both spans of "`a` `b`" are found.

        :param text: message text to scan
        :return: list of ``(start, end, span)`` tuples, where ``span`` is the
            code including its backticks and ``span == text[start:end]``
        """
        pattern = r'(?:^|(?<=[\s*~_]))(`((?!`).+?)`)(?=[\s*~_]|$)'
        return [(match.start(1), match.end(1), match.group(1))
                for match in re.finditer(pattern, text)]
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217

    def merge_match_groups(self, real_text, inline_matches, multiline_matches):
        """Split ``real_text`` into ordered, classified segments.

        Both match lists are expected to hold ``(start, end, text)`` tuples
        sorted by ``start``. The text is walked from left to right; at each
        step the match that starts first is emitted, preceded by any plain
        text that lies before it. Matches that start inside an already
        emitted segment are dropped, so no part of the text is emitted twice.

        :param real_text: the complete message text
        :param inline_matches: inline code matches
        :param multiline_matches: multiline code block matches
        :return: list of ``(text, MatchType)`` tuples covering ``real_text``
        """
        it_inline = iter(inline_matches)
        it_multi = iter(multiline_matches)
        length = len(real_text)

        # An exhausted iterator yields this sentinel located at the end of
        # the text, which keeps the comparisons below free of special cases.
        sentinel = (length, length, "")

        cur_inline = next(it_inline, sentinel)
        cur_multi = next(it_multi, sentinel)

        pos = 0

        # This will contain tuples with parts of the input and its
        # classification
        parts = []
        while pos < length:
            log.debug("-> in: %s", str(cur_inline))
            log.debug("-> mu: %s", str(cur_multi))

            # selected = (start, end, type)
            if cur_inline[0] < cur_multi[0]:
                selected = (cur_inline[0], cur_inline[1], MatchType.INLINE)
            elif cur_multi[0] < length:
                selected = (cur_multi[0], cur_multi[1], MatchType.MULTILINE)
            else:
                selected = (pos, length, MatchType.TEXT)
            log.debug("--> select: %s", str(selected))

            # Handle plain text string parts (and unforeseen errors...)
            if pos < selected[0]:
                parts.append((real_text[pos:selected[0]], MatchType.TEXT))
                pos = selected[0]
            elif pos > selected[0]:
                log.error("Should not happen, position > found match.")

            # Cut out and append selected text segment
            parts.append((real_text[selected[0]:selected[1]], selected[2]))
            pos = selected[1]

            # Forward the iterator the selected match came from.
            if selected[2] == MatchType.INLINE:
                cur_inline = next(it_inline, sentinel)
            elif selected[2] == MatchType.MULTILINE:
                cur_multi = next(it_multi, sentinel)

            # Drop *every* pending match that starts inside the text consumed
            # so far. Skipping only one would leave e.g. the second inline
            # match of a multiline block pending, which would move the cursor
            # backwards and emit text twice.
            while cur_inline is not sentinel and cur_inline[0] < pos:
                cur_inline = next(it_inline, sentinel)
            while cur_multi is not sentinel and cur_multi[0] < pos:
                cur_multi = next(it_multi, sentinel)

        return parts

    def process_text(self, real_text, other_tags, _graphics, iter_,
            _additional):
        """Insert ``real_text`` into the text view, highlighting code in it.

        The text is searched for inline and multiline code snippets. If there
        are none, nothing is done. Otherwise the text is split into plain and
        code segments: plain segments are handed to the text view's
        ``detect_and_print_special_text``, code segments are inserted and
        formatted by ``insert_and_format_code``. Finally the text view's
        ``plugin_modified`` flag is set.

        :param real_text: message text to process
        :param other_tags: tag names forwarded to the insertion calls
        :param _graphics: forwarded as ``graphics`` for plain segments
        :param iter_: insertion position, or None to use the buffer's end
        :param _additional: forwarded as ``additional_data`` for plain
            segments
        """
        def marker_layout(edge_char, bare_width, force=False):
            # Returns (marker width, padding). The width grows by one when a
            # newline belongs to the marker, either because the segment
            # already has one at this edge or because one is forced (in which
            # case it is returned as padding to add).
            if edge_char == '\n':
                return (bare_width + 1, '')
            if force:
                return (bare_width + 1, '\n')
            return (bare_width, '')

        buf = self.textview.tv.get_buffer()

        # first, try to find inline or multiline code snippets
        inline_matches = self.find_inline_matches(real_text)
        multiline_matches = self.find_multiline_matches(real_text)
        if not (inline_matches or multiline_matches):
            log.debug("Stopping early, since there is no code block in it....")
            return

        iterator = buf.get_end_iter() if iter_ is None else iter_

        # The start mark has left gravity, so it stays put while text is
        # inserted at its position; the end mark follows the inserted text.
        start_mark = buf.create_mark("SHP_start", iterator, True)
        end_mark = buf.create_mark("SHP_end", iterator, False)

        break_multiline = self.check_line_break(True)
        break_inline = self.check_line_break(False)

        segments = self.merge_match_groups(
                real_text, inline_matches, multiline_matches)
        last_index = len(segments) - 1

        buf.begin_user_action()

        for index, (segment, match_type) in enumerate(segments):
            if match_type == MatchType.TEXT:
                self.textview.detect_and_print_special_text(
                        segment, other_tags, graphics=_graphics,
                        iter_=iterator, additional_data=_additional)
            else:
                language = None
                is_last = index == last_index

                if match_type == MatchType.MULTILINE:
                    found = re.search(
                            '\n*```([^\n]*)\n', segment, re.DOTALL)
                    if found is not None:
                        language = found.group(1)

                    # We account the language word width for the front marker
                    front_width = 3 if language is None else 3 + len(language)
                    back_width = 3
                    force_break = break_multiline
                else:
                    front_width = back_width = 1
                    force_break = break_inline

                front_len, front_pad = marker_layout(
                        segment[0], front_width, force_break)
                back_len, back_pad = marker_layout(
                        segment[-1], back_width, force_break and not is_last)

                marker_widths = (front_len, back_len)
                padded_segment = ''.join([front_pad, segment, back_pad])

                # insertion invalidates iterator, let's use our start mark...
                self.insert_and_format_code(buf, padded_segment, language,
                        marker_widths, start_mark, end_mark, other_tags)

            # the current end of the buffer's contents is the start for the
            # next iteration
            iterator = buf.get_iter_at_mark(end_mark)
            buf.move_mark(start_mark, iterator)

        buf.delete_mark(start_mark)
        buf.delete_mark(end_mark)

        buf.end_user_action()

        # We have to make sure this is the last thing we do (i.e. no more
        # calls to the other textview methods from here on), because the
        # print_special_text method is resetting the plugin_modified variable...
        self.textview.plugin_modified = True

235 236 237
    def insert_and_format_code(self, buf, insert_text, language, marker,
            start_mark, end_mark, other_tags=None):
        """Insert a code snippet at ``start_mark`` and format it.

        After insertion the text between ``start_mark`` and ``end_mark`` is
        taken as the snippet including its markers. The markers are styled
        and the code highlighted by ``format_code``; afterwards a general
        code block tag (font and, if enabled in the config, background
        colour) is applied to the whole snippet.

        :param buf: text buffer to insert into
        :param insert_text: snippet text including its markers
        :param language: language name passed on to ``format_code``
        :param marker: ``(front_width, back_width)`` — number of characters
            taken up by the opening and closing marker
        :param start_mark: mark at which the text is inserted; it must stay
            in front of the inserted text
        :param end_mark: mark that ends up behind the inserted text
        :param other_tags: optional tag names applied to the inserted text
        """
        insert_at = buf.get_iter_at_mark(start_mark)
        if other_tags:
            buf.insert_with_tags_by_name(insert_at, insert_text,
                    *other_tags)
        else:
            buf.insert(insert_at, insert_text)

        # The insertion invalidated the iterator, so resolve the bounds of
        # the snippet from the marks.
        tag_start = buf.get_iter_at_mark(start_mark)
        tag_end = buf.get_iter_at_mark(end_mark)

        # The code itself lies between the two markers.
        s_code = tag_start.copy()
        e_code = tag_end.copy()
        s_code.forward_chars(marker[0])
        e_code.backward_chars(marker[1])

        log.debug("full text between tags: %s.", tag_start.get_text(tag_end))

        self.format_code(buf, tag_start, s_code, tag_end, e_code, language)

        self.textview.plugin_modified = True

        # Set general code block format
        tag = Gtk.TextTag.new()
        if self.config.is_bgcolor_override_enabled():
            tag.set_property('background', self.config.get_bgcolor())
            tag.set_property('paragraph-background', self.config.get_bgcolor())
        tag.set_property('font', self.config.get_font())
        buf.get_tag_table().add(tag)

        # format_code may have modified the buffer, which invalidates the
        # iterators obtained above; resolve the bounds from the marks again
        # instead of reusing them.
        buf.apply_tag(tag,
                buf.get_iter_at_mark(start_mark),
                buf.get_iter_at_mark(end_mark))
267 268 269 270 271

    def __init__(self, config, textview):
        self.last_end_mark  = None
        self.config         = config
        self.textview       = textview