chat_syntax_highlighter.py 10.9 KB
Newer Older
1 2 3 4 5 6
import logging
import re
import pygments

from gi.repository import Gtk

7 8 9 10
from syntax_highlight.gtkformatter import GTKFormatter
from syntax_highlight.types import MatchType
from syntax_highlight.types import LineBreakOptions
from syntax_highlight.types import CodeMarkerOptions
11

Philipp Hörist's avatar
Philipp Hörist committed
12
# Module-level logger used by the log calls in this file.
log = logging.getLogger('gajim.p.syntax_highlight')
13

14

15 16 17 18 19 20 21 22 23 24 25 26 27 28
class ChatSyntaxHighlighter:
    """Find code snippets in chat text and insert them highlighted.

    Inline snippets are delimited by single backticks, multiline snippets
    by lines starting with three backticks. The text is split into plain,
    inline and multiline parts; code parts are inserted into the text
    view's buffer and formatted with pygments via ``GTKFormatter``.
    """

    # A fence line: three backticks at the very start of the text or right
    # after a newline, optionally followed by non-whitespace characters
    # (e.g. a language name), ending with a newline or the end of the text.
    # Less strict alternative that allows prefixed whitespaces:
    #   r'(?:^|\n)[ |\t]*(```)\S*[ |\t]*(?:\n|$)'
    _FENCE_RE = re.compile(r'(?:^|\n)(```)\S*(?:\n|$)')

    # An inline span. The delimiters around the span are checked with
    # zero-width lookarounds so that a single delimiter character can end
    # one span and introduce the next one (e.g. "`a` `b`").
    _INLINE_RE = re.compile(r'(?<![^\s*~_])(`((?!`).+?)`)(?![^\s*~_])')

    # Captures whatever follows the opening fence up to the end of its line.
    _LANGUAGE_RE = re.compile(r'\n*```([^\n]*)\n')

    def __init__(self, config, textview):
        """Store the plugin configuration and the text view to write to."""
        self.last_end_mark = None
        self.config = config
        self.textview = textview

    def hide_code_markup(self, buf, start, end):
        """Make the range between *start* and *end* invisible in *buf*.

        The 'hide_code_markup' tag is created in the buffer's tag table on
        first use and looked up by name afterwards.
        """
        tag_table = buf.get_tag_table()
        if tag_table.lookup('hide_code_markup') is None:
            tag = Gtk.TextTag.new('hide_code_markup')
            tag.set_property('invisible', True)
            tag_table.add(tag)

        buf.apply_tag_by_name('hide_code_markup', start, end)

    def check_line_break(self, is_multiline):
        """Return whether the configured line break action applies.

        True if the action is ALWAYS, or if it is MULTILINE and
        *is_multiline* is set.
        """
        line_break = self.config.get_line_break_action()
        if line_break == LineBreakOptions.ALWAYS:
            return True
        return is_multiline and line_break == LineBreakOptions.MULTILINE

    def format_code(self, buf, s_tag, s_code, e_tag, e_code, language):
        """Style the code markers and highlight the code between them.

        *s_tag*..*s_code* is the front marker, *e_code*..*e_tag* the back
        marker and *s_code*..*e_code* the code itself (all text iterators
        of *buf*). *language* selects the lexer; ``None`` selects the
        configured default lexer.
        """
        style = self.config.get_style_name()
        if self.config.get_code_marker_setting() == CodeMarkerOptions.HIDE:
            self.hide_code_markup(buf, s_tag, s_code)
            self.hide_code_markup(buf, e_code, e_tag)
        else:
            # Markers stay visible: render them like a comment token.
            comment_tag = GTKFormatter.create_tag_for_token(
                pygments.token.Comment,
                pygments.styles.get_style_by_name(style))
            buf.get_tag_table().add(comment_tag)
            buf.apply_tag(comment_tag, s_tag, s_code)
            buf.apply_tag(comment_tag, e_code, e_tag)

        code = s_code.get_text(e_code)
        log.debug('full text to encode: %s.', code)

        # NOTE(review): this anonymous mark is never deleted here; confirm
        # whether GTKFormatter takes care of it before adding a delete.
        start_mark = buf.create_mark(None, s_code, False)

        if language is None:
            lexer = self.config.get_default_lexer()
            log.info('No Language specified. '
                     'Falling back to default lexer: %s.',
                     self.config.get_default_lexer_name())
        else:
            log.debug('Using lexer for %s.', str(language))
            lexer = self.config.get_lexer_with_fallback(language)

        if lexer is None:
            iterator = buf.get_iter_at_mark(start_mark)
            buf.insert(iterator, '\n')
        elif not self.config.is_internal_none_lexer(lexer):
            tokens = pygments.lex(code, lexer)
            formatter = GTKFormatter(style=style, start_mark=start_mark)
            pygments.format(tokens, formatter, buf)

    def find_multiline_matches(self, text):
        """Return the fenced code blocks found in *text*.

        Each entry is a ``(start, end, matched_text)`` tuple spanning from
        the opening fence to the closing fence, both included. A fence only
        closes a block if it is preceded by a newline; an opening fence
        without a closing one yields no entry.
        """
        matches = []
        opening = None
        for fence in self._FENCE_RE.finditer(text):
            if opening is None:
                opening = fence
            elif fence.group(0).startswith('\n```'):
                begin, end = opening.start(), fence.end()
                matches.append((begin, end, text[begin:end]))
                opening = None
            # Otherwise: not an end, keep looking for the closing fence.
        return matches

    def find_inline_matches(self, text):
        """Return the inline code spans found in *text*.

        Inline code is highlighted if the start marker is preceded by the
        start of the text, a whitespace character or either of the other
        span markers defined in XEP-0393 (``*``, ``~``, ``_``). The same
        applies mirrored to the end marker.

        Each entry is a ``(start, end, matched_text)`` tuple that covers
        the span including its backticks.
        """
        return [(match.start(1), match.end(1), match.group(1))
                for match in self._INLINE_RE.finditer(text)]

    def merge_match_groups(self, real_text, inline_matches, multiline_matches):
        """Split *real_text* into consecutive, classified parts.

        *inline_matches* and *multiline_matches* are lists of
        ``(start, end, text)`` tuples ordered by position. The result is a
        list of ``(text, MatchType)`` tuples. Where matches overlap, the one
        starting first is used and every match beginning inside it is
        dropped; on equal start positions the multiline match is used.
        """
        it_inline = iter(inline_matches)
        it_multi = iter(multiline_matches)
        length = len(real_text)

        # Exhausted iterators yield a sentinel located at the end of the
        # text, which keeps the comparisons below free of special cases.
        def get_next(iterator):
            return next(iterator, (length, length, ''))

        cur_inline = get_next(it_inline)
        cur_multi = get_next(it_multi)

        pos = 0

        # This will contain tuples with parts of the input and its
        # classification
        parts = []
        while pos < length:
            log.debug('-> in: %s', str(cur_inline))
            log.debug('-> mu: %s', str(cur_multi))

            # selected = (start, end, type)
            if cur_inline[0] < cur_multi[0]:
                selected = (cur_inline[0], cur_inline[1], MatchType.INLINE)
            elif cur_multi[0] < length:
                selected = (cur_multi[0], cur_multi[1], MatchType.MULTILINE)
            else:
                selected = (pos, length, MatchType.TEXT)
            log.debug('--> select: %s', str(selected))

            # Handle plain text string parts (and unforeseen errors...)
            if pos < selected[0]:
                parts.append((real_text[pos:selected[0]], MatchType.TEXT))
            elif pos > selected[0]:
                log.error('Should not happen, position > found match.')

            # Cut out and append selected text segment
            parts.append((real_text[selected[0]:selected[1]], selected[2]))
            pos = selected[1]

            # Forward the iterator we consumed from, then drop every match
            # of the other kind that starts inside the consumed region.
            # A multiline block may contain several inline spans, so a
            # single step is not enough here.
            if selected[2] == MatchType.INLINE:
                cur_inline = get_next(it_inline)
                while cur_multi[0] < pos:
                    cur_multi = get_next(it_multi)
            elif selected[2] == MatchType.MULTILINE:
                cur_multi = get_next(it_multi)
                while cur_inline[0] < pos:
                    cur_inline = get_next(it_inline)

        return parts

    def process_text(self, real_text, other_tags, _graphics, iter_,
            _additional):
        """Insert *real_text* into the text view with highlighted code.

        Returns early, without touching the buffer or setting
        ``plugin_modified``, if the text contains no code snippet.
        Otherwise plain parts are passed to the text view's
        ``detect_and_print_special_text`` and code parts are inserted and
        formatted by this class. Insertion starts at *iter_*, or at the end
        of the buffer if *iter_* is ``None``.
        """
        def fix_newline(char, marker_len_no_newline, force=False):
            # Returns (marker width, text to add). The width grows by one
            # if `char` already is a newline or if one gets added because
            # of `force`.
            fixed = (marker_len_no_newline, '')
            if char == '\n':
                fixed = (marker_len_no_newline + 1, '')
            elif force:
                fixed = (marker_len_no_newline + 1, '\n')
            return fixed

        buf = self.textview.tv.get_buffer()

        # First, try to find inline or multiline code snippets
        inline_matches = self.find_inline_matches(real_text)
        multiline_matches = self.find_multiline_matches(real_text)

        if not inline_matches and not multiline_matches:
            log.debug('Stopping early, since there is no code block in it...')
            return

        insert_newline_for_multiline = self.check_line_break(True)
        insert_newline_for_inline = self.check_line_break(False)

        split_text = self.merge_match_groups(
            real_text, inline_matches, multiline_matches)
        last_index = len(split_text) - 1

        iterator = iter_ if iter_ is not None else buf.get_end_iter()

        # Create a start marker with left gravity before inserting text.
        start_mark = buf.create_mark('SHP_start', iterator, True)
        end_mark = buf.create_mark('SHP_end', iterator, False)

        buf.begin_user_action()
        try:
            for num, (text_to_insert, match_type) in enumerate(split_text):
                language = None
                end_of_message = num == last_index

                if match_type == MatchType.TEXT:
                    self.textview.detect_and_print_special_text(
                        text_to_insert, other_tags, graphics=_graphics,
                        iter_=iterator, additional_data=_additional)
                else:
                    if match_type == MatchType.MULTILINE:
                        language_match = self._LANGUAGE_RE.search(
                            text_to_insert)
                        if language_match is not None:
                            language = language_match.group(1)

                        language_len = 0 if language is None \
                            else len(language)

                        # We account the language word width for the front
                        # marker
                        front = fix_newline(
                            text_to_insert[0],
                            3 + language_len,
                            insert_newline_for_multiline)
                        back = fix_newline(
                            text_to_insert[-1],
                            3,
                            insert_newline_for_multiline
                            and not end_of_message)
                    else:
                        front = fix_newline(
                            text_to_insert[0],
                            1,
                            insert_newline_for_inline)
                        back = fix_newline(
                            text_to_insert[-1],
                            1,
                            insert_newline_for_inline and not end_of_message)

                    marker_widths = (front[0], back[0])
                    text_to_insert = ''.join(
                        [front[1], text_to_insert, back[1]])

                    # Insertion invalidates iterator, let's use our start
                    # mark...
                    self.insert_and_format_code(
                        buf, text_to_insert, language, marker_widths,
                        start_mark, end_mark, other_tags)

                iterator = buf.get_iter_at_mark(end_mark)
                # The current end of the buffer's contents is the start for
                # the next iteration
                buf.move_mark(start_mark, iterator)
        finally:
            # Always release the named marks and close the user action, so
            # a failure above cannot leave them behind for the next call.
            buf.delete_mark(start_mark)
            buf.delete_mark(end_mark)
            buf.end_user_action()

        # We have to make sure this is the last thing we do (i.e. no calls to
        # the other textview methods no more from here on), because the
        # print_special_text method is resetting the plugin_modified variable...
        self.textview.plugin_modified = True

    def insert_and_format_code(self, buf, insert_text, language, marker,
            start_mark, end_mark, other_tags=None):
        """Insert *insert_text* at *start_mark* and format it as code.

        *marker* is a ``(front_width, back_width)`` tuple giving the number
        of characters at both ends of the inserted text that belong to the
        code markers. The inserted range is taken to be the text between
        *start_mark* and *end_mark* after the insertion.
        """
        start_iter = buf.get_iter_at_mark(start_mark)

        if other_tags:
            buf.insert_with_tags_by_name(start_iter, insert_text,
                    *other_tags)
        else:
            buf.insert(start_iter, insert_text)

        # Fetch fresh iterators from the marks after the insertion.
        tag_start = buf.get_iter_at_mark(start_mark)
        tag_end = buf.get_iter_at_mark(end_mark)
        s_code = tag_start.copy()
        e_code = tag_end.copy()
        s_code.forward_chars(marker[0])
        e_code.backward_chars(marker[1])

        log.debug('full text between tags: %s.', tag_start.get_text(tag_end))

        self.format_code(buf, tag_start, s_code, tag_end, e_code, language)

        self.textview.plugin_modified = True

        # Set general code block format
        tag = Gtk.TextTag.new()
        if self.config.is_bgcolor_override_enabled():
            bgcolor = self.config.get_bgcolor()
            tag.set_property('background', bgcolor)
            tag.set_property('paragraph-background', bgcolor)
        tag.set_property('font', self.config.get_font())
        buf.get_tag_table().add(tag)
        buf.apply_tag(tag, tag_start, tag_end)