# .. Copyright (C) 2012-2016 Bryan A. Jones.
#
#    This file is part of CodeChat.
#
#    CodeChat is free software: you can redistribute it and/or modify it under
#    the terms of the GNU General Public License as published by the Free
#    Software Foundation, either version 3 of the License, or (at your option)
#    any later version.
#
#    CodeChat is distributed in the hope that it will be useful, but WITHOUT ANY
#    WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
#    FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
#    details.
#
#    You should have received a copy of the GNU General Public License along
#    with CodeChat. If not, see <http://www.gnu.org/licenses/>.
#
# **********************************************************
# CodeToRest.py - a module to translate source code to reST
# **********************************************************
# The API_ lists four functions which convert source code into either reST or
# HTML. For a discussion of how this is accomplished, the lexer_to_rest_
# function forms the core of the algorithm; Step_5_ gives a detailed
# explanation of how the code is translated to reST.
#
# .. contents::
#
# Imports
# =======
# These are listed in the order prescribed by `PEP 8
# <http://www.python.org/dev/peps/pep-0008/#imports>`_.
#
# Standard library
# ----------------
# For calling code_to_rest with a string. While using cStringIO would be
# great, it doesn't support Unicode, so we can't.
from io import StringIO
# To find a file's extension and locate data files.
import os.path
# For enumerations.
from enum import Enum
#
# Third-party imports
# -------------------
# Used to open files with unknown encodings and to run docutils itself.
from docutils import io, core
# Import CodeBlock as a base class for FencedCodeBlock.
from docutils.parsers.rst.directives.body import CodeBlock
# Import directives to register the new FencedCodeBlock and SetLine directives.
from docutils.parsers.rst import directives, Directive
# For the docutils default stylesheet and template.
import docutils.writers.html4css1
from docutils.writers.html4css1 import Writer
from pygments import lex
from pygments.lexers import get_lexer_for_filename, get_lexer_by_name, \
    get_lexer_for_mimetype, guess_lexer_for_filename, guess_lexer
from pygments.token import Token
#
# Local application imports
# -------------------------
from .CommentDelimiterInfo import COMMENT_DELIMITER_INFO
#
# API
# ===
# The following routines provide easy access to the core functionality of this
# module: code_to_rest_string_, code_to_rest_file_, code_to_html_string_, and
# code_to_html_file_.
#
# .. _code_to_rest_string:
#
# This function converts a string containing code to reST, returning the
# result as a string.
def code_to_rest_string(
  # .. _code_str:
  #
  # The code to translate to reST.
  code_str,
  # .. _options:
  #
  # Specify the lexer (see get_lexer_ arguments), and provide it any other
  # needed options.
  **options):

    # Use a StringIO to capture writes into a string.
    output_rst = StringIO()
    _lexer_to_rest(code_str, get_lexer(code=code_str, **options), output_rst)
    return output_rst.getvalue()

# .. _code_to_rest_file:
#
# Convert a source file to a reST file.
def code_to_rest_file(
  # .. _source_path:
  #
  # Path to a source code file to process.
  source_path,
  # Path to a destination reST file to create. It will be overwritten if it
  # already exists.
  rst_path,
  # .. _input_encoding:
  #
  # Encoding to use for the input file. The default of None detects the
  # encoding of the input file.
  input_encoding=None,
  # .. _output_encoding:
  #
  # Encoding to use for the output file.
  output_encoding='utf-8',
  # See options_.
  **options):

    # Use docutils' I/O classes to better handle and sniff encodings.
    #
    # Note: both these classes automatically close themselves after a
    # read or write.
    fi = io.FileInput(source_path=source_path, encoding=input_encoding)
    fo = io.FileOutput(destination_path=rst_path, encoding=output_encoding)
    code_str = fi.read()
    lexer = get_lexer(filename=source_path, code=code_str, **options)
    rst = code_to_rest_string(code_str, lexer=lexer)
    fo.write(rst)

# .. _code_to_html_string:
#
# This converts a string containing source code to HTML, which it returns as a
# string.
def code_to_html_string(
  # See code_str_.
  code_str,
  # A file-like object where warnings and errors will be written, or None to
  # send them to stderr.
  warning_stream=None,
  # See options_.
  **options):

    rest = code_to_rest_string(code_str, **options)
    # `docutils <http://docutils.sourceforge.net/>`_ converts reST to HTML.
    html = core.publish_string(rest, writer_name='html',
      settings_overrides={
        # Include our custom css file: provide the path to the default css and
        # then to our css. The stylesheet dirs must include docutils defaults.
        # However, Writer.default_stylesheet_dirs doesn't work when frozen,
        # because (I think) it relies on a relative path which the frozen
        # environment doesn't have. So, rebuild that path manually.
        'stylesheet_path': Writer.default_stylesheet + ',CodeChat.css',
        'stylesheet_dirs': ['.',
                            os.path.dirname(docutils.writers.html4css1.__file__),
                            os.path.join(os.path.dirname(__file__), 'template')],
        # The default template uses a relative path, which doesn't work when
        # frozen. So, provide an absolute path.
        'template': os.path.join(os.path.dirname(docutils.writers.html4css1.__file__),
                                 Writer.default_template),
        # Make sure to use Unicode everywhere.
        'output_encoding': 'unicode',
        'input_encoding': 'unicode',
        # Don't stop processing, no matter what.
        'halt_level': 5,
        # Capture errors to a string and return it.
        'warning_stream': warning_stream})
    return html

# .. _code_to_html_file:
#
# Convert source code stored in a file to HTML, which is saved in another file.
def code_to_html_file(
  # See source_path_.
  source_path,
  # Destination file name to hold the generated HTML. This file will be
  # overwritten. If not supplied, *source_path*\ ``.html`` will be assumed.
  html_path=None,
  # See input_encoding_.
  input_encoding=None,
  # See output_encoding_.
  output_encoding='utf-8'):

    html_path = html_path or source_path + '.html'
    fi = io.FileInput(source_path=source_path, encoding=input_encoding)
    fo = io.FileOutput(destination_path=html_path, encoding=output_encoding)
    code_str = fi.read()
    lexer = get_lexer(filename=source_path, code=code_str)
    html = code_to_html_string(code_str, lexer=lexer)
    fo.write(html)
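#
# For example, a short, hypothetical session (the file name ``hello.py`` is
# illustrative, not part of this module) might use the API like this:
#
# .. code-block:: Python3
#
#    from CodeChat.CodeToRest import code_to_rest_string, code_to_html_file
#
#    # Translate a string of source code to reST, picking a lexer by its alias.
#    rst = code_to_rest_string('# A comment.\nfoo = 1\n', alias='python3')
#
#    # Translate a source file to HTML, producing ``hello.py.html``; the lexer
#    # is inferred from the file name and contents.
#    code_to_html_file('hello.py')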
#
#
# Supporting routines
# -------------------
#
# .. _get_lexer:
#
# Provide several ways to find a lexer. Provide any of the following arguments,
# and this function will return the appropriate lexer for it.
def get_lexer(
  # The lexer itself, which will simply be returned.
  lexer=None,
  # The `short name <http://pygments.org/docs/lexers/>`_, or alias, of the
  # lexer to use.
  alias=None,
  # The filename of the source file to lex.
  filename=None,
  # The MIME type of the source file to lex.
  mimetype=None,
  # The code to be highlighted, used to guess a lexer.
  code=None,
  # See options_.
  **options):

    if lexer:
        return lexer
    if alias:
        return get_lexer_by_name(alias, **options)
    if filename:
        if code:
            return guess_lexer_for_filename(filename, code, **options)
        else:
            return get_lexer_for_filename(filename, **options)
    if mimetype:
        return get_lexer_for_mimetype(mimetype, **options)
    if code:
        return guess_lexer(code, **options)

# Provide the ability to print debug info if needed.
def _debug_print(val):
    # Uncomment for debug prints.
    #print(val)
    pass
#
#
# .. _lexer_to_rest:
#
# Implementation
# ==============
# This routine transforms source code to reST, preserving all indentation of
# both source code and comments. To do so, the comment characters are stripped
# from reST-formatted comments and all code is placed inside code blocks.
#
# **This routine is the heart of the algorithm.**
def _lexer_to_rest(
  # See code_str_.
  code_str,
  # .. _lexer:
  #
  # The lexer used to analyze the code.
  lexer,
  # See out_file_.
  out_file):

    _debug_print('Lexer: {}\n'.format(lexer.name))
    # Gather some additional information, based on the lexer, which is needed
    # to correctly process comments:
    cdi = COMMENT_DELIMITER_INFO[lexer.name]
    # * If there's no multi-line start info, then classify generic comments as
    #   inline.
    comment_is_inline = not cdi[1]
    # * Likewise, no inline info indicates that generic comments are block
    #   comments.
    comment_is_block = not cdi[0]

    # \1. Invoke a Pygments lexer on the provided source code, obtaining an
    #     iterable of tokens.
    token_iter = lex(code_str, lexer)

    # \2. Combine tokens from the lexer into three groups: whitespace, comment,
    #     or other.
    token_group = _group_lexer_tokens(token_iter, comment_is_inline,
                                      comment_is_block)

    # \3. Make a per-line list of [group, ws_len, string], so that the last
    #     string in each list ends with a newline. Change the group of block
    #     comments that actually span multiple lines.
    gathered_group = _gather_groups_on_newlines(token_group, cdi)

    # \4. Classify each line. For reST-formatted comments, remove the leading
    #     whitespace and all comment characters (the // or #, for example),
    #     then classify.
    classified_group = _classify_groups(gathered_group, cdi)

    # \5. Run a state machine to output the corresponding reST.
    _generate_rest(classified_group, out_file)
#
#
# Step 2 of lexer_to_rest_
# ------------------------
# Given tokens, group them.
def _group_lexer_tokens(
  # An iterable of (tokentype, string) pairs provided by the lexer, per
  # `get_tokens
  # <http://pygments.org/docs/api/#pygments.lexer.Lexer.get_tokens>`_.
  iter_token,
  # .. _comment_is_inline:
  #
  # When true, classify generic comments as inline.
  comment_is_inline,
  # .. _comment_is_block:
  #
  # When true, classify generic comments as block comments.
  comment_is_block):

    # Keep track of the current group and string.
    tokentype, current_string = next(iter_token)
    current_group = _group_for_tokentype(tokentype, comment_is_inline,
                                         comment_is_block)
    _debug_print('tokentype = {}, string = {}\n'.
                 format(tokentype, [current_string]))

    # Walk through tokens.
    for tokentype, string in iter_token:
        group = _group_for_tokentype(tokentype, comment_is_inline,
                                     comment_is_block)
        _debug_print('tokentype = {}, string = {}\n'.
                     format(tokentype, [string]))

        # If there's a change in group, yield what we've accumulated so far,
        # then initialize the state to the newly-found group and string.
        if current_group != group:
            yield current_group, current_string
            current_group = group
            current_string = string
        # Otherwise, keep accumulating.
        else:
            current_string += string

    # Output the final pair, if we have one.
    if current_string:
        yield current_group, current_string
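#
# As an illustrative sketch (not part of the module), the grouping for a small
# C fragment looks like this:
#
# .. code-block:: Python3
#
#    from pygments import lex
#    from pygments.lexers import get_lexer_by_name
#
#    token_iter = lex('int i; // A comment.\n', get_lexer_by_name('c'))
#    # Prints pairs such as (_GROUP.other, ['int']), (_GROUP.whitespace, [' ']),
#    # ..., ending with (_GROUP.inline_comment, ['// A comment.\n']).
#    for group, string in _group_lexer_tokens(token_iter, False, False):
#        print(group, [string])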
#
#
# Supporting routines and definitions
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
# Define the groups into which tokens will be placed.
class _GROUP(Enum):
    # The basic classification used by group_for_tokentype_.
    whitespace = 1
    inline_comment = 2
    other = 3
    # A ``/* comment */``-style comment contained in one string.
    block_comment = 4
    # Grouping is::
    #
    #    /* BLOCK_COMMENT_START
    #       BLOCK_COMMENT_BODY, (repeats for all comment body)
    #       BLOCK_COMMENT_END */
    block_comment_start = 5
    block_comment_body = 6
    block_comment_end = 7
#
# .. _group_for_tokentype:
#
# Given a tokentype, group it.
def _group_for_tokentype(
  # The tokentype to place into a group.
  tokentype,
  # See comment_is_inline_.
  comment_is_inline,
  # See comment_is_block_.
  comment_is_block):

    # The list of Pygments `tokens <http://pygments.org/docs/tokens/>`_ lists
    # ``Token.Text`` (how a newline is classified) and ``Token.Whitespace``.
    # Consider either as whitespace.
    if tokentype in Token.Text or tokentype in Token.Whitespace:
        return _GROUP.whitespace
    # There is a Token.Comment, but this can refer to inline or block comments.
    # Therefore, use info from CommentDelimiterInfo as a tiebreaker.
    if (tokentype == Token.Comment.Single or
        (tokentype == Token.Comment and comment_is_inline)):
        return _GROUP.inline_comment
    if (tokentype == Token.Comment.Multiline or
        (tokentype == Token.Comment and comment_is_block)):
        return _GROUP.block_comment
    # If the tiebreakers above don't classify a Token.Comment, then assume it
    # to be an inline comment. This occurs in the Matlab lexer using Pygments
    # 2.0.2.
    if tokentype == Token.Comment:
        return _GROUP.inline_comment

    # Note: though Pygments does support a `String.Doc token type
    # <http://pygments.org/docs/tokens/>`_, it doesn't truly identify
    # docstrings; from Pygments 2.1.3, ``pygments.lexers.python.PythonLexer``:
    #
    # .. code-block:: Python3
    #    :linenos:
    #    :lineno-start: 55
    #
    #    (r'^(\s*)([rRuU]{,2}"""(?:.|\n)*?""")', bygroups(Text, String.Doc)),
    #    (r"^(\s*)([rRuU]{,2}'''(?:.|\n)*?''')", bygroups(Text, String.Doc)),
    #
    # which means that ``String.Doc`` is simply *ANY* triple-quoted string.
    # Looking at other lexers, ``String.Doc`` often refers to a
    # specially-formatted comment, not a docstring. From
    # ``pygments.lexers.javascript``:
    #
    # .. code-block:: Javascript
    #    :linenos:
    #    :lineno-start: 591
    #
    #    (r'//.*?\n', Comment.Single),
    #    (r'/\*\*!.*?\*/', String.Doc),
    #    (r'/\*.*?\*/', Comment.Multiline),
    #
    # So, the ``String.Doc`` token can't be used in any meaningful way by
    # CodeToRest to identify docstrings. Per Wikipedia's `docstrings article
    # <https://en.wikipedia.org/wiki/Docstring>`_, only three languages support
    # this feature. So, we'll need a language-specific approach. For Python,
    # `PEP 0257 <https://www.python.org/dev/peps/pep-0257/>`_ provides the
    # syntax and even some code to correctly remove docstring indentation. The
    # `ast <https://docs.python.org/3/library/ast.html>`_ module provides
    # routines which parse a given Python string without executing it, which
    # looks like a better way to go than evaluating arbitrary Python and then
    # looking at its ``__doc__`` attributes. One approach would be to find
    # docstrings, then convert them into comments before the lexer is run.
    # However, playing around with the following, it looks like lineno gives
    # the last line of the string, which might make fixing it harder. See also
    # http://bugs.python.org/issue16806. Some test code:
    #
    # .. code-block:: Python3
    #    :linenos:
    #
    #    for _ in ast.walk(ast.parse(open('tst.py').read())):
    #        try:
    #            d = ast.get_docstring(_)
    #            if d:
    #                print('Patch docstring ending at line {}:\n{}\n'
    #                      '====================\n'.format(_.body[0].lineno, d))
    #        except (AttributeError, TypeError):
    #            pass
    #
    # Perhaps the approach would be to scan with ast, then see if the line
    # number matches the ending line number of a string, and if so convert the
    # string into a comment. Trickiness: Python will merge strings; consider
    # the following:
    #
    # .. code-block:: pycon
    #    :linenos:
    #
    #    >>> def foo():
    #    ...     ("""A comment.""" \
    #    ...      ' More.'
    #    ...      " And more.")
    #    ...     pass
    #    ...
    #    >>> print(foo.__doc__)
    #    A comment. More. And more.
    #
    # It's probably best not to support this case. Unfortunately, ast reports
    # this as a single string, rather than as a list of several elements.
    # The approach: make sure the docstring found by ast is in the text of a
    # Pygments string token. If so, replace the string token with a block
    # comment whose contents come from ``inspect.cleandoc`` of the docstring.
    return _GROUP.other
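#
# A couple of hypothetical checks of this classification:
#
# .. code-block:: Python3
#
#    from pygments.token import Token
#
#    # A single-line comment always groups as an inline comment.
#    assert (_group_for_tokentype(Token.Comment.Single, False, False) ==
#            _GROUP.inline_comment)
#    # A generic ``Token.Comment`` is classified by the tiebreakers.
#    assert (_group_for_tokentype(Token.Comment, False, True) ==
#            _GROUP.block_comment)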
#
#
# Step #3 of lexer_to_rest_
# -------------------------
# Given an iterable of groups, break them into lists based on newlines. Each
# list consists of (group, comment_leading_whitespace_length, string) tuples.
def _gather_groups_on_newlines(
  # An iterable of (group, string) pairs provided by
  # ``group_lexer_tokens``.
  iter_grouped,
  # .. _comment_delim_info:
  #
  # An element of COMMENT_DELIMITER_INFO for the language being classified.
  comment_delim_info):

    # Keep a list of (group, ws_len, string) tuples we're accumulating.
    l = []

    # Keep track of the length of whitespace at the beginning of the body and
    # end portions of a block comment.
    ws_len = 0

    # Accumulate until we find a newline, then yield that.
    for group, string in iter_grouped:
        _debug_print('group = {}, string = {}\n'.format(group, [string]))
        # A given group (such as a block string) may extend across multiple
        # newlines. Split these groups apart first.
        splitlines = string.splitlines(True)

        # Look for block comments spread across multiple lines and label
        # them correctly.
        if len(splitlines) > 1 and group == _GROUP.block_comment:
            group = _GROUP.block_comment_start
            # Look for an indented multi-line comment block. First, determine
            # what the indent must be: (column in which the comment starts) +
            # (length of the opening comment delimiter) + (1 space).
            ws_len = comment_delim_info[1] + 1
            for group_, ws_len_, string_ in l:
                ws_len += len(string_)

            # Determine the indent style (all spaces, or spaces followed by a
            # character, typically ``*``). If it's not spaces only, it must
            # be spaces followed by a delimiter.
            #
            # First, get the last character of the opening block comment
            # delimiter.
            last_delim_char = string[comment_delim_info[1] - 1]
            if _is_space_indented_line(splitlines[1], ws_len, last_delim_char,
                                       len(splitlines) == 1,
                                       comment_delim_info):
                is_indented_line = _is_space_indented_line
            else:
                is_indented_line = _is_delim_indented_line

            # Look at the second and following lines to see if their indent is
            # consistent.
            for i, line in enumerate(splitlines[1:]):
                if not is_indented_line(line, ws_len, last_delim_char,
                                        len(splitlines) - 2 == i,
                                        comment_delim_info):
                    # It's inconsistent. Set ws_len to 0 to signal that this
                    # isn't an indented block comment.
                    ws_len = 0
                    break

        for index, split_str in enumerate(splitlines):
            # Accumulate results.
            l.append( (group, ws_len, split_str) )

            # For block comments, move from a start to a body group.
            if group == _GROUP.block_comment_start:
                group = _GROUP.block_comment_body
            # If the next line is the last line, update the block
            # group.
            is_next_to_last_line = index == len(splitlines) - 2
            if (is_next_to_last_line and
                group == _GROUP.block_comment_body):
                group = _GROUP.block_comment_end

            # Yield when we find a newline, then clear our accumulator.
            if split_str.endswith('\n'):
                yield l
                l = []

        # We've output a group; reset the ws_len to 0 in case the group just
        # output was a multi-line block comment with ws_len > 0.
        ws_len = 0

    # Output the final group, if one is still accumulating.
    if l:
        yield l
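#
# A hypothetical sketch of the per-line lists this step produces for a C block
# comment (``COMMENT_DELIMITER_INFO['C']`` is assumed to be ``(2, 2, 2)``):
#
# .. code-block:: Python3
#
#    from pygments import lex
#    from pygments.lexers import get_lexer_by_name
#
#    token_iter = lex('/* A multi-\n   line comment */\n',
#                     get_lexer_by_name('c'))
#    token_group = _group_lexer_tokens(token_iter, False, False)
#    # Prints lists such as:
#    #   [(_GROUP.block_comment_start, 3, '/* A multi-\n')]
#    #   [(_GROUP.block_comment_end, 3, '   line comment */'),
#    #    (_GROUP.whitespace, 0, '\n')]
#    for line_list in _gather_groups_on_newlines(token_group, (2, 2, 2)):
#        print(line_list)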
#
#
# Block comment indentation processing
# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
#
# #. A single line: ``/* comment */``. No special handling needed.
# #. Multiple lines, indented with spaces. For example:
#
#    .. code-block:: c
#       :linenos:
#
#       Column:  1
#       1234567890234567
#
#         /* A multi-
#
#            line
#       WRONG INDENTATION
#               string
#          */
#
#    In the code above, the text of the comment begins at column 6 of line 4,
#    with the letter A. Therefore, line 7 lacks the necessary 5-space indent,
#    so no indentation will be removed from this comment.
#
#    If line 7 were indented properly, the resulting reST would be:
#
#    .. code-block:: rest
#
#       ...reST to indent the left margin of the following text 2 spaces...
#
#       A multi-
#
#       line
#       RIGHT INDENTATION
#          string
#
#       ...reST to end the left margin indent...
#
#    Note that the first 5 characters were stripped off each line, leaving only
#    line 8 indented (preserving its indentation relative to the comment's
#    indentation). Some special cases in doing this processing:
#
#    * Line 5 may contain less than the expected 5-space indent: it could be
#      only a newline. This must be supported as a special case.
#    * The comment closing (line 9) contains just 3 spaces; this is allowed.
#      If there are non-space characters before the closing comment delimiter,
#      they must not occur before column 6. For example,
#
#      .. code-block:: c
#
#         /* A multi-
#            line comment */
#
#      and
#
#      .. code-block:: c
#
#         /* A multi-
#            line comment
#         */
#
#      have consistent indentation. In particular, the last line of a
#      multi-line comment may contain zero or more whitespace characters
#      followed by the closing block comment delimiter. However,
#
#      .. code-block:: c
#
#         /* A multi-
#           line comment */
#
#      is not sufficiently indented to qualify for indentation removal.
#
# So, to recognize:
#
def _is_space_indented_line(
  # A line from a multi-line comment to examine.
  line,
  # The expected indent to check for; a length, in characters.
  indent_len,
  # Placeholder for the delimiter expected near the end of an indent (one
  # character). Not used by this function, but this function must take the
  # same parameters as is_delim_indented_line_.
  delim,
  # True if this is the last line of a multi-line comment.
  is_last,
  # See comment_delim_info_.
  comment_delim_info):

    # A line containing only whitespace is always considered valid.
    if line.isspace():
        return True
    # A line beginning with indent_len spaces has the expected indent.
    if len(line) > indent_len and line[:indent_len].isspace():
        return True
    # The closing delimiter will always be followed by a newline, hence the
    # - 1 factor.
    line_except_closing_delim = line[:-comment_delim_info[2] - 1]
    # Last line: zero or more whitespace characters followed by the closing
    # block comment delimiter is valid. Since ``''.isspace() == False``, check
    # for an empty string and consider it valid, too.
    if (is_last and (not line_except_closing_delim or
                     line_except_closing_delim.isspace())):
        return True
    # No other correctly indented cases.
    return False
#
#
# (Continuing from the list above...)
#
# #. Multiple lines, indented with spaces followed by a delimiter. For example:
#
#    .. code-block:: c
#       :linenos:
#
#       Column:  1
#       1234567890123456789
#       /* Multi-
#        * line
#        *
#        *WRONG INDENTATION
#          * WRONG INDENTATION
#        */
#
#    The rules are similar to the previous case (indents with spaces alone).
#    However, there is one special case: line 5 doesn't require a space after
#    the asterisk; a newline is acceptable. If the indentation is corrected,
#    the result is:
#
#    .. code-block:: rest
#
#       ...reST to indent the left margin of the following text 2 spaces...
#
#       Multi-
#       line
#
#       RIGHT INDENTATION
#       RIGHT INDENTATION
#
#       ...reST to end the left margin indent...
#
# So, to recognize:
#
# .. _is_delim_indented_line:
#
def _is_delim_indented_line(
  # A line from a multi-line comment to examine.
  line,
  # The expected indent to check for; a length, in characters.
  indent_len,
  # The delimiter expected near the end of an indent (one character).
  delim,
  # True if this is the last line of a multi-line comment.
  is_last,
  # See comment_delim_info_.
  comment_delim_info):

    # A line with the correct number of spaces, followed by the delimiter, then
    # either a space or a newline, is correctly indented.
    if (len(line) >= indent_len and
        line[:indent_len - 2].isspace() and
        line[indent_len - 2] == delim and
        line[indent_len - 1] in '\n '):
        return True
    # Last line possibility: indent_len - 2 spaces followed by the closing
    # delimiter is a valid indent. For example, an indent of 3 begins with
    # ``/* comment`` and can end with `` */``: (indent_len == 3) - (the 2
    # characters that are usually a * followed by a space) + (the closing
    # delimiter ``*/``, 2 characters long) == 3.
    if (is_last and
        len(line) == indent_len - 2 + comment_delim_info[2] and
        line[:indent_len - 2].isspace()):
        return True
    # No other correctly indented cases.
    return False
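#
# A quick, hypothetical check of both recognizers, using the C-style values
# from the examples above (an expected indent of 3, delimiter lengths of
# ``(2, 2, 2)``):
#
# .. code-block:: Python3
#
#    # A space-indented body line: three spaces, then text.
#    assert _is_space_indented_line('   text\n', 3, '*', False, (2, 2, 2))
#    # A delimiter-indented body line: a space, ``*``, a space, then text.
#    assert _is_delim_indented_line(' * text\n', 3, '*', False, (2, 2, 2))
#    # A last line consisting only of the indented closing delimiter.
#    assert _is_delim_indented_line(' */', 3, '*', True, (2, 2, 2))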
#
#
# Step #4 of lexer_to_rest_
# -------------------------
# Classify the output of ``gather_groups_on_newlines`` into either code or a
# comment with n leading whitespace characters. Remove all comment characters.
#
# .. Note::
#
#    This is a destructive edit, instead of a classification. To make this
#    invertible, it needs to be non-destructive. The idea:
#
#    * Output s, the entire line as a string, if it's not a reST comment.
#      Currently, it outputs -1, s.
#    * Output (whitespace characters or '', opening comment delimiter and
#      space, comment text, closing comment delimiter or '', newline).
#      Currently, it outputs len(leading whitespace characters),
#      comment text + newline.
def _classify_groups(
  # An iterable of [(group1, string1_no_newline), (group2,
  # string2_no_newline), ..., (groupN, stringN_ending_newline)], produced by
  # ``gather_groups_on_newlines``.
  iter_gathered_groups,
  # See comment_delim_info_.
  comment_delim_info):

    # Keep track of block comment state.
    is_block_rest_comment = False

    # Walk through groups.
    for l in iter_gathered_groups:
        _debug_print('[(group, ws_len, string), ...] = {}\n'.format(l))

        if _is_rest_comment(l, is_block_rest_comment, comment_delim_info):

            first_group, first_ws_len, first_string = l[0]
            # The type = the number of leading whitespace characters, or 0 if
            # none.
            if first_group == _GROUP.whitespace:
                # Encode this whitespace in the type, then drop it.
                type_ = len(first_string)
                l.pop(0)
                first_group, first_ws_len, first_string = l[0]
            # For body or end block comments, use the indent set at the
            # beginning of the comment. Otherwise, there is no indent, so
            # set it to 0.
            elif not _is_block_body_or_end(first_group):
                type_ = 0

            # Update the block reST state.
            if first_group == _GROUP.block_comment_start:
                is_block_rest_comment = True

            # Strip all comment characters off the strings and combine them.
            string = ''.join([_remove_comment_delim(group, string,
                comment_delim_info) for group, ws_len, string in l])
            # Remove the initial space character from the first comment,
            # or ws_len characters from body or end comments.
            if _is_block_body_or_end(first_group):
                # Some of the body or end block lines may be just whitespace.
                # Don't strip these: the line may be too short or we might
                # remove a newline. Or, this might be inconsistent indentation
                # in which ws_len == 0, in which case we should also strip
                # nothing.
                if not string.isspace() and first_ws_len > 0:
                    # The first ws_len - 1 characters should be stripped.
                    string = string[first_ws_len - 1:]
                    # The next character, if it's a space, should also be
                    # stripped.
                    if string[0] == ' ':
                        string = string[1:]
            # A comment of ``//\n`` qualifies as a reST comment, but should
            # not have the ``\n`` stripped off. Avoid this case. Otherwise,
            # handle the more typical ``// comment`` case, in which the space
            # after the comment delimiter should be removed.
            elif len(string) > 0 and string[0] == ' ':
                string = string[1:]

        # Everything else is considered code.
        else:
            type_ = -1
            string = ''.join([string for group, ws_len, string in l])
            is_block_rest_comment = False

        yield type_, string
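#
# A hypothetical sketch of the classification for the C fragment used earlier:
#
# .. code-block:: Python3
#
#    from pygments import lex
#    from pygments.lexers import get_lexer_by_name
#
#    token_iter = lex('int i; // A comment.\n', get_lexer_by_name('c'))
#    token_group = _group_lexer_tokens(token_iter, False, False)
#    gathered = _gather_groups_on_newlines(token_group, (2, 2, 2))
#    # A mixed code/comment line classifies as code: this prints
#    # (-1, 'int i; // A comment.\n'). A pure comment line such as
#    # ``// A comment.\n`` would instead produce (0, 'A comment.\n').
#    for type_, string in _classify_groups(gathered, (2, 2, 2)):
#        print(type_, [string])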
#
#
# Supporting routines
# ^^^^^^^^^^^^^^^^^^^
# Given a (group, string) tuple, return the string with the comment delimiter
# removed if it is a comment, or just the string if it's not a comment.
def _remove_comment_delim(
  # The group this string was classified into.
  group,
  # The string corresponding to this group.
  string,
  # See comment_delim_info_.
  comment_delim_info):

    # Number of characters in an inline comment delimiter.
    (len_inline_comment_delim,
    # Number of characters in an opening block comment delimiter.
     len_opening_block_comment_delim,
    # Number of characters in a closing block comment delimiter.
     len_closing_block_comment_delim) = comment_delim_info

    if group == _GROUP.inline_comment:
        return string[len_inline_comment_delim:]
    if group == _GROUP.block_comment:
        return string[len_opening_block_comment_delim:
                      -len_closing_block_comment_delim]
    if group == _GROUP.block_comment_start:
        return string[len_opening_block_comment_delim:]
    if group == _GROUP.block_comment_end:
        return string[:-len_closing_block_comment_delim]
    else:
        return string

# Return a string with the given delimiter removed from its beginning.
#
# .. note::
#
#    This code isn't used yet -- it's for a rewrite which will support multiple
#    delimiters.
def _remove_beginning_comment_delim(
  # Either the number of characters in the beginning delimiter, or a tuple of
  # strings which give all valid beginning comment delimiters.
  beginning_comment_delim_seq,
  # The string which starts with the delimiter to be removed.
  string):

    # Loop through all delimiters.
    for bcd in beginning_comment_delim_seq:
        # If we find one at the beginning of the string, strip it off.
        if string.startswith(bcd):
            return string[len(bcd):]

    # Not found -- panic.
    assert False

# Determine if the given line is a comment to be interpreted by reST.
# Supports ``remove_comment_chars``, ``classify_groups``.
def _is_rest_comment(
  # A sequence of (group, ws_len, string) tuples representing a single line.
  line_list,
  # True if this line contains the body or end of a block comment
  # that will be interpreted by reST.
  is_block_rest_comment,
  # See comment_delim_info_.
  comment_delim_info):

    # See if there is any _GROUP.other in this line. If so, it's not a reST
    # comment.
    group_tuple, ws_len_tuple, string_tuple = list(zip(*line_list))
    if _GROUP.other in group_tuple:
        return False
    # If there are no comments (meaning the entire line is whitespace), it's
    # not a reST comment.
    if group_tuple == (_GROUP.whitespace, ):
        return False

    # Find the first comment. There may be whitespace preceding it, so select
    # the correct index.
    first_comment_index = 1 if group_tuple[0] == _GROUP.whitespace else 0
    first_group = group_tuple[first_comment_index]
    first_string = string_tuple[first_comment_index]
    # The cases are:
    #
    # 1. ``// comment, //\n, #`` -> reST comment. Note that in some languages
    #    (Python is one example), the newline isn't included in the comment.
    # 2. ``//comment`` -> not a reST comment.
    # 3. ``/* comment, /*\n`` -> reST comment.
    # 4. Any block comment body or end for which its block start was a reST
    #    comment -> reST comment.
    # 5. ``/**/`` -> a reST comment. (I could see this either as reST or not;
    #    because it was simpler, I picked reST.)
    #
    # Begin by checking case #4 above.
    if is_block_rest_comment and _is_block_body_or_end(first_group):
        return True
    # To check the other cases, first remove the comment delimiter so we can
    # examine the character following the delimiter.
    first_comment_text = _remove_comment_delim(first_group, first_string,
                                               comment_delim_info)
    first_char_is_rest = ((len(first_comment_text) > 0 and
                           first_comment_text[0] in (' ', '\n')) or
                          len(first_comment_text) == 0)
    if first_char_is_rest and not _is_block_body_or_end(first_group):
        return True
    return False

# Determine if this group is either a block comment body or a block comment
# end.
def _is_block_body_or_end(group):
    return group in (_GROUP.block_comment_body, _GROUP.block_comment_end)
#
#
# .. _Step_5:
#
# Step #5 of lexer_to_rest_
# -------------------------
# When creating a reST block containing code or comments, two difficulties
# arise: preserving the indentation of both source code and comments; and
# preserving empty lines of code at the beginning or end of a block of code.
# In the following examples, examine both the source code and the resulting
# HTML to get the full picture, since the text below is (after all) in reST,
# and will therefore be transformed to HTML.
#
# First, consider a method to preserve empty lines of code. Consider, for
# example, the following:
#
# +--------------------------+-------------------------+-----------------------------------+
# + Python source            + Translated to reST      + Translated to (simplified) HTML   |
# +==========================+=========================+===================================+
# | .. code-block:: Python3  | Do something ::         | .. code-block:: html              |
# |                          |                         |                                   |
# |  # Do something          |   foo = 1               |  <p>Do something:</p>             |
# |  foo = 1                 |                         |  <pre>foo = 1                     |
# |                          | Do something else ::    |  </pre>                           |
# |  # Do something else     |                         |  <p>Do something else:</p>        |
# |  bar = 2                 |   bar = 2               |  <pre>bar = 2                     |
# |                          |                         |  </pre>                           |
# +--------------------------+-------------------------+-----------------------------------+
#
# In this example, the blank line is lost, since reST allows the literal block
# containing ``foo = 1`` to end with multiple blank lines; the resulting HTML
# contains only one newline between each of these lines. To solve this, some
# CSS hackery helps tighten up spacing between lines. In addition, this routine
# adds a one-line fence, removed during processing, at the beginning and end of
# each code block to preserve blank lines. The new translation becomes:
#
# +--------------------------+-------------------------+-----------------------------------+
# + Python source            + Translated to reST      + Translated to (simplified) HTML   |
# +==========================+=========================+===================================+
# | .. code-block:: Python3  | Do something            | .. code-block:: html              |
# |                          |                         |                                   |
# |  # Do something          | .. fenced-code::        |  <p>Do something:</p>             |
# |  foo = 1                 |                         |  <pre>foo = 1                     |
# |                          |    Beginning fence      |                                   |
# |  # Do something else     |    foo = 1              |  </pre>                           |
# |  bar = 2                 |    Ending fence         |  <p>Do something else:</p>        |
# |                          |                         |  <pre>bar = 2                     |
# |                          | Do something else       |  </pre>                           |
# |                          |                         |                                   |
# |                          | .. fenced-code::        |                                   |
# |                          |                         |                                   |
# |                          |    Beginning fence      |                                   |
# |                          |    bar = 2              |                                   |
# |                          |    Ending fence         |                                   |
# |                          |                         |                                   |
# +--------------------------+-------------------------+-----------------------------------+
#
# Preserving indentation
# ^^^^^^^^^^^^^^^^^^^^^^
# The same fence approach also preserves indentation. Without the fences,
# indentation is consumed by the reST parser:
#
# +--------------------------+-------------------------+-----------------------------------+
# + Python source            + Translated to reST      + Translated to (simplified) HTML   |
# +==========================+=========================+===================================+
# | .. code-block:: Python3  | One space indent ::     | .. code-block:: html              |
# |                          |                         |                                   |
# |   # One space indent     |   foo = 1               |  <p>One space indent</p>          |
# |   foo = 1                |                         |  <pre>foo = 1                     |
# |  # No indent             | No indent ::            |  </pre>                           |
# |  bar = 2                 |                         |  <p>No indent</p>                 |
# |                          |   bar = 2               |  <pre>bar = 2                     |
# |                          |                         |  </pre>                           |
# +--------------------------+-------------------------+-----------------------------------+
#
# With fences added:
#
# +--------------------------+-------------------------+-----------------------------------+
# + Python source            + Translated to reST      + Translated to (simplified) HTML   |
# +==========================+=========================+===================================+
# | .. code-block:: Python3  | One space indent        | .. code-block:: html              |
# |                          |                         |                                   |
# |   # One space indent     | .. fenced-code::        |  <p>One space indent</p>          |
# |   foo = 1                |                         |  <pre> foo = 1                    |
# |  # No indent             |    Beginning fence      |                                   |
# |  bar = 2                 |     foo = 1             |  </pre>                           |
# |                          |    Ending fence         |  <p>No indent</p>                 |
# |                          |                         |  <pre>bar = 2                     |
# |                          | No indent               |  </pre>                           |
# |                          |                         |                                   |
# |                          | .. fenced-code::        |                                   |
# |                          |                         |                                   |
# |                          |    Beginning fence      |                                   |
# |                          |    bar = 2              |                                   |
# |                          |    Ending fence         |                                   |
# +--------------------------+-------------------------+-----------------------------------+
#
# Preserving indentation for comments is more difficult. Blockquotes in reST
# are defined by common indentation, so that any number of (common) spaces
# define a blockquote. So, the distance between a zero and two-space indent is
# the same as the distance between a two-space and a three-space indent; we
# need the second indent to be half the size of the first.
#
# +--------------------------+-------------------------+-----------------------------------+
# + Python source            + Translated to reST      + Translated to (simplified) HTML   |
# +==========================+=========================+===================================+
# | .. code-block:: Python3  |  No indent              | .. code-block:: html              |
# |                          |                         |                                   |
# |  # No indent             |    Two space indent     |  <p>No indent</p>                 |
# |    # Two space indent    |                         |  <blockquote>Two space indent     |
# |     # Three space indent |     Three space indent  |   <blockquote>Three space         |
# |                          |                         |    indent</blockquote>            |
# |                          |                         |  </blockquote>                    |
# +--------------------------+-------------------------+-----------------------------------+
#
# To fix this, the `raw directive
# <http://docutils.sourceforge.net/docs/ref/rst/directives.html#raw-data-pass-through>`_
# is used to insert a pair of ``<div>`` and ``</div>`` HTML elements which set
# the left margin of indented text based on how many spaces (0.5 em = 1 space).
#
# +--------------------------+-------------------------+-----------------------------------+
# + Python source            + Translated to reST      + Translated to (simplified) HTML   |
# +==========================+=========================+===================================+
# | .. code-block:: Python3  |  No indent              | .. code-block:: html              |
# |                          |                         |                                   |
# |  # No indent             |  .. raw:: html          |  <p>No indent</p>                 |
# |    # Two space indent    |                         |  <div style=                      |
# |     # Three space indent |   <div style=           |   "margin-left:1.0em;">           |
# |                          |    "margin-left:1.0em;">|  <p>Two space indent</p>          |
# |                          |                         |  </div>                           |
# |                          |  Two space indent       |  <div style=                      |
# |                          |                         |   "margin-left:1.5em;">           |
# |                          |  .. raw:: html          |  <p>Three space indent</p>        |
# |                          |                         |  </div>                           |
# |                          |   </div>                |                                   |
# |                          |                         |                                   |
# |                          |  .. raw:: html          |                                   |
# |                          |                         |                                   |
# |                          |   <div style=           |                                   |
# |                          |    "margin-left:1.5em;">|                                   |
# |                          |                         |                                   |
# |                          |  Three space indent     |                                   |
# |                          |                         |                                   |
# |                          |  .. raw:: html          |                                   |
# |                          |                         |                                   |
# |                          |   </div>                |                                   |
# +--------------------------+-------------------------+-----------------------------------+
#
# Following either a fenced code block or a raw block, care must be taken to
# separate these blocks' content from indented comments which follow them. For
# example, the following code:
#
# +--------------------------+-------------------------+-----------------------------------+
# + Python source            + Translated to reST      + Translated to (simplified) HTML   |
# +==========================+=========================+===================================+
# | .. code-block:: Python3  |  .. fenced-code::       | .. code-block:: html              |
# |                          |                         |                                   |
# |  def foo():              |     Beginning fence     |  <pre>def foo():                  |
# |  #      Indented comment |     def foo():          |  Ending fence                     |
# |                          |     Ending fence        |  </pre>                           |
# |                          |                         |                                   |
# |                          |        Indented comment |                                   |
# +--------------------------+-------------------------+-----------------------------------+
#
# Notice that the ``Ending fence`` ends up in the resulting HTML! To fix this,
# simply add an unindented `reST comment
# <http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#comments>`_
# after a block.
#
# +--------------------------+-------------------------+-----------------------------------+
# + Python source            + Translated to reST      + Translated to (simplified) HTML   |
# +==========================+=========================+===================================+
# | .. code-block:: Python3  |  .. fenced-code::       | .. code-block:: html              |
# |                          |                         |                                   |
# |  def foo():              |     Beginning fence     |  <pre>def foo():                  |
# |  #      Indented comment |     def foo():          |  </pre>                           |
# |                          |     Ending fence        |  <blockquote>                     |
# |                          |                         |   Indented comment                |
# |                          |  ..                     |  </blockquote>                    |
# |                          |                         |                                   |
# |                          |     Indented comment    |                                   |
# +--------------------------+-------------------------+-----------------------------------+
#
# Mixed code and comments
# ^^^^^^^^^^^^^^^^^^^^^^^
# Note that mixing code and comments is hard: reST will still apply some of
# its parsing rules to an inline code block or inline literal, meaning
# that leading or trailing spaces and backticks will not be preserved,
# instead parsing incorrectly. For example ::
#
#    :code:` Testing `
#
# renders incorrectly. So, mixed lines are simply translated as code, meaning
# reST markup can't be applied to the comments.
#
# .. _summary_and_implementation:
#
# Summary and implementation
# ^^^^^^^^^^^^^^^^^^^^^^^^^^
# This boils down to the following basic rules:
#
# #. Code blocks must be preceded and followed by a removed marker (fences).
#
# #. Comments must be preceded and followed by reST which sets the left
#    margin based on the number of spaces before the comment.
#
# #. Both must be followed by an empty, unindented `reST comment`_, as the
#    sketch after this list shows.
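#
# A hypothetical sketch of these rules in action: running a two-line Python
# input through ``code_to_rest_string`` shows the fences, the margin-setting
# raw blocks, and the terminating comments described above (the ``set-line``
# values vary with the input):
#
# .. code-block:: Python3
#
#    rst = code_to_rest_string('# Hi.\nfoo = 1\n', alias='python3')
#    # Prints reST containing ``Hi.`` as text, plus a ``fenced-code`` block
#    # wrapping ``foo = 1`` between the two fence lines.
#    print(rst)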
#
# .. _generate_rest:
#
# Generate reST from the classified code. To do this, create a state machine,
# where current_type defines the state. When the state changes, exit the
# previous state (output a closing fence or a closing ``</div>``), then enter
# the new state (output a fenced code block or an opening ``<div>``).
def _generate_rest(
  # An iterable of (type, string) pairs, one per line.
  classified_lines,
  # .. _out_file:
  #
  # A file-like output to which the reST text is written.
  out_file):

    # Keep track of the current type. Begin with a 0-indent comment.
    current_type = -2

    # Keep track of the current line number.
    line = 1

    for type_, string in classified_lines:
        _debug_print('type_ = {}, line = {}, string = {}\n'.
                     format(type_, line, [string]))

        # See if there's a change in state.
        if current_type != type_:
            # Exit the current state.
            _exit_state(current_type, out_file)

            # Enter the new state.
            #
            # Code state: emit the beginning of a fenced block.
            if type_ == -1:
                out_file.write('\n.. fenced-code::\n\n Beginning fence\n')
            # Comment state: emit an opening indent for non-zero indents.
            else:
                # Add an indent if needed.
                if type_ > 0:
                    out_file.write('\n.. raw:: html\n\n'
                                   ' <div style="margin-left:{}em;">\n\n'
                                   .format(0.5*type_))
                # Specify the line number in the source, so that errors will
                # be accurately reported. This isn't necessary in code blocks,
                # since errors can't occur there.
                #
                # After this directive, the following text may be indented,
                # which would make it a part of the ``set-line`` directive! So,
                # include an empty comment to terminate the ``set-line``,
                # making any following indents a separate syntactical element.
                # See the end of the `reST comment syntax
                # <http://docutils.sourceforge.net/docs/ref/rst/restructuredtext.html#comments>`_
                # for more discussion.
                out_file.write('\n.. set-line:: {}\n\n..\n\n'.format(line - 4))

        # Output the string based on the state. All code needs an initial
        # space to place it inside the fenced-code block.
        if type_ == -1:
            out_file.write(' ')
        out_file.write(string)

        # Update the state.
        current_type = type_
        line += 1

    # When done, exit the last state.
    _exit_state(current_type, out_file)
#
#
# Supporting routines
# """""""""""""""""""
# Output the text produced when exiting a state. Supports generate_rest_
# above.
def _exit_state(
  # The type (classification) of the last line.
  type_,
  # See out_file_.
  out_file):

    # Code state: emit an ending fence.
    if type_ == -1:
        out_file.write(' Ending fence\n\n..\n\n')
    # Comment state: emit a closing indent.
    elif type_ > 0:
        out_file.write('\n.. raw:: html\n\n </div>\n\n..\n\n')
    # Initial state. Nothing needed.
    else:
        pass
#
#
# Supporting reST directives
# """"""""""""""""""""""""""
# Create a fenced code block: the first and last lines are presumed to be
# fences, which keep the parser from discarding whitespace. Drop these, then
# treat everything else as code.
#
# See the `directive docs
# <http://docutils.sourceforge.net/docs/howto/rst-directives.html>`_ for more
# information.
class _FencedCodeBlock(CodeBlock):
    def run(self):
        # The content must contain at least two lines (the fences).
        if len(self.content) < 2:
            raise self.error('Fenced code block must contain at least two '
                             'lines.')
        # Remove the fences.
        self.content = self.content[1:-1]
        #
        # By default, the Pygments `stripnl
        # <http://pygments.org/docs/lexers/>`_ option is True, causing Pygments
        # to drop any empty lines. The reST parser converts a line containing
        # only spaces to an empty line, which will then be stripped by Pygments
        # if these are leading or trailing newlines. So, add a space back in to
        # keep these lines from being dropped.
        #
        # So, first add spaces from the beginning of the lines until we reach
        # the first non-blank line.
        processedAllContent = True
        for i, content in enumerate(self.content):
            if content:
                processedAllContent = False
                break
            self.content[i] = ' '
        # If we've seen all the content, then don't do it again -- we'd be
        # adding unnecessary spaces. Otherwise, walk from the end of the
        # content backwards, adding spaces until the first non-blank line.
        if not processedAllContent:
            for i, _ in enumerate(self.content):
                # Recall Python indexing: while 0 is the first element in a
                # list, -1 is the last element, so offset all indices by -1.
                if self.content[-i - 1]:
                    break
                self.content[-i - 1] = ' '

        # Now, process the resulting contents as a code block.
        nodeList = CodeBlock.run(self)

        # Sphinx fix: if the current `highlight
        # <http://sphinx-doc.org/markup/code.html>`_ language is ``python``,
        # "Normal Python code is only highlighted if it is parseable" (quoted
        # from the previous link). This means code snippets, such as
        # ``def foo():``, won't be highlighted: Python wants
        # ``def foo(): pass``, for example. To get around this, setting the
        # ``highlight_args`` option "force"=True skips the parsing. I found
        # this in ``Sphinx.highlighting.highlight_block`` (see the ``force``
        # argument) and in ``Sphinx.writers.html.HTMLWriter.
        # visit_literal_block``, where the ``code-block`` directive (which
        # supports fragments of code, not just parsable code) has
        # ``highlight_args['force'] = True`` set. This should be ignored by
        # docutils, so it's done for both Sphinx and docutils. **Note:** This
        # is based on examining Sphinx 1.3.1 source code.
        #
        # Note that the nodeList returned by the CodeBlock directive contains
        # only a single ``literal_block`` node. The setting should be applied
        # to it.
        nodeList[0]['highlight_args'] = {'force': True}

        return nodeList
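#
# A hypothetical exercise of this directive (importing this module registers
# it with docutils first):
#
# .. code-block:: Python3
#
#    from docutils import core
#
#    # The first and last content lines (the fences) are dropped; only
#    # ``foo = 1`` is highlighted as code.
#    html = core.publish_string(
#        '.. fenced-code::\n'
#        '\n'
#        '   Beginning fence\n'
#        '   foo = 1\n'
#        '   Ending fence\n',
#        writer_name='html',
#        settings_overrides={'output_encoding': 'unicode'})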
#
#
# This directive allows changing the line number at which errors are reported.
# ``.. set-line:: 10`` makes the current line report as line 10, regardless of
# its actual location in the file.
class _SetLine(Directive):
    required_arguments = 1
    optional_arguments = 0
    final_argument_whitespace = False
    option_spec = {}
    has_content = False

    def run(self):
        line = int(self.arguments[0])
        # The ``input_lines`` class (see docutils.statemachine.ViewList)
        # maintains two lists, ``data`` and ``items``. ``data`` is a list of
        # strings, one per line, of reST to process. ``items`` is a list of
        # ``(source, offset)`` pairs giving the source of each line and the
        # offset of each line from the beginning of its source. Modifying the
        # offset will change the reported error location.
        il = self.state_machine.input_lines
        # However, directly modifying ``input_lines`` produces no effect: the
        # root ``input_lines`` must be modified. Find it.
        while il.parent:
            il = il.parent
        # Now, determine the index of this directive's line in the root
        # ``input_lines``.
        current_line = (self.state_machine.line_offset +
                        self.state_machine.input_offset)
        # Walk from this line to the end of the file, rewriting the offset
        # (that is, the effective line number).
        items = il.items
        for i in range(current_line, len(items)):
            (source, offset) = items[i]
            items[i] = (source, line)
            line += 1
        # This directive creates no nodes.
        return []

# Register the new fenced code block and set-line directives with docutils.
directives.register_directive('fenced-code', _FencedCodeBlock)
directives.register_directive('set-line', _SetLine)
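#
# A hypothetical check that ``set-line`` shifts reported error locations (the
# exact warning text varies by docutils version):
#
# .. code-block:: Python3
#
#    from io import StringIO
#    from docutils import core
#
#    warnings = StringIO()
#    core.publish_string('.. set-line:: 100\n\n..\n\n*oops\n',
#                        writer_name='html',
#                        settings_overrides={'warning_stream': warnings,
#                                            'halt_level': 5})
#    # The inline-emphasis warning now refers to a line near 100, not line 5.
#    print(warnings.getvalue())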