# -*- coding: utf-8 -*- 

# lp4all: literate programming embedded in source code as wiki comments
# Copyright (C) 2006 Jean-Marie Favreau, Frédéric Lehobey, David Mentré
#                    and Thomas Petazzoni
# 
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
# 
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
# 
# You should have received a copy of the GNU General Public License along
# with this program; if not, write to the Free Software Foundation, Inc.,
# 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.

Parsing of source code files

This module parses source code files and splits each file into a list of blocks. Each block is represented by a Block object, which is either a Code or a Comment object.


import sys
import os.path
import re
from iface import *

The different kinds of match for a comment, used by the LineCommentStyle and BlockCommentStyle classes. See below.

class MatchStart:
    """Result returned when the beginning of a multi-line comment is found.

    content     -- comment text found on the opening line
    indentation -- indentation of the comment opener (the comment styles
                   in this file pass the length of the leading whitespace)
    """

    def __init__(self, content, indentation):
        self.content, self.indentation = content, indentation

class MatchEnd:
    """Result returned when the end of the current comment is found.

    content   -- text carried by the line that ended the comment
    iscomment -- True when that line still belongs to the comment, False
                 when it is already the first line of the following code
    """

    def __init__(self, content, iscomment=True):
        self.content, self.iscomment = content, iscomment

class MatchStartEnd:
    """Result returned when a comment starts and ends on the same line.

    content     -- comment text found on the line
    indentation -- indentation of the comment opener (the comment styles
                   in this file pass the length of the leading whitespace)
    """

    def __init__(self, content, indentation):
        self.content, self.indentation = content, indentation

class MatchLineComment:
    """Result returned for one more line of an already started comment.

    content -- text of the line to append to the current comment
    """

    def __init__(self, content):
        # Only the text is needed: the line extends the comment in progress.
        setattr(self, "content", content)

The following classes LineCommentStyle, BlockCommentStyle and UnknownCommentStyle are used to define the comment styles of the various programming languages supported by this parser. See the FileParser constructor below for their usage.

Each of these classes must implement two methods, findstart() and findnext(). These methods must return None when nothing matches, or one of the four following objects:

  • a MatchStart object if the beginning of a comment has been found
  • a MatchEnd object if the end of a comment has been found
  • a MatchStartEnd object if a single line comment has been found
  • a MatchLineComment if a new line has been found for an already started comment

Represent a line-style comment, like // in C++

class LineCommentStyle:
    """Line-style comment syntax, like ``//`` in C++ or ``#`` in Python.

    A wiki comment starts on a line whose comment marker is immediately
    followed by the wiki tag.  The following lines continue the comment
    as long as they begin with the comment marker; the tag is optional
    on continuation lines.
    """

    def __init__(self, commenttag, commentstart):
        """Compile the regular expressions for this comment syntax.

        commenttag   -- marker that distinguishes wiki comments
        commentstart -- string that introduces a line comment
        """
        self.commentstart = commentstart
        marker = re.escape(commentstart)
        tag = re.escape(commenttag)
        # Optional leading blanks, the comment marker, the wiki tag, then
        # the comment text.
        self.regexp_start = re.compile("^(?P<spaces>[ \t]*)" + marker
                                       + tag + "(?P<content>.*)")
        # Same, but the wiki tag is optional on continuation lines.
        self.regexp_continue = re.compile("^[ \t]*" + marker
                                          + "(" + tag + ")?(?P<content>.*)")

    def __str__(self):
        return "[LineCommentStyle] Commentstart: %s" % self.commentstart

    def findstart(self, line, is_in_comment):
        """Return a MatchStart if ``line`` opens a wiki comment, else None.

        ``is_in_comment`` is accepted for interface compatibility with the
        other comment styles and is not used.
        """
        match_obj = self.regexp_start.match(line)
        if match_obj is None:
            return None
        return MatchStart(match_obj.group('content') + "\n",
                          len(match_obj.group('spaces')))

    def findnext(self, line, is_in_comment):
        """Classify ``line`` while a line comment is in progress.

        Returns None when ``is_in_comment`` is false.  Otherwise returns a
        MatchLineComment if the line still starts with the comment marker,
        or a MatchEnd(line, False) because the line already belongs to the
        code that follows the comment.
        """
        if not is_in_comment:
            return None
        match_obj = self.regexp_continue.match(line)
        if match_obj is not None:
            return MatchLineComment(match_obj.group('content') + "\n")
        return MatchEnd(line, False)

Represent a block-style comment, like /* ... */ in C

class BlockCommentStyle:
    """Block-style comment syntax, like ``/* ... */`` in C.

    A wiki comment starts where the opening delimiter is immediately
    followed by the wiki tag, and ends on the line containing the closing
    delimiter.
    """

    def __init__(self, commenttag, commentstart, commentend, magicsymbol):
        """Compile the regular expressions for this comment syntax.

        commenttag   -- marker that distinguishes wiki comments
        commentstart -- opening delimiter of a block comment
        commentend   -- closing delimiter of a block comment
        magicsymbol  -- decoration that may begin the inner lines of a
                        comment (such as ``*``); it is stripped from them
        """
        self.commentstart = commentstart
        self.commentend = commentend
        # Optional leading blanks, the opening delimiter, the wiki tag,
        # then the comment text.
        opener = ("^(?P<spaces>[ \t]*)" + re.escape(commentstart)
                  + re.escape(commenttag) + "(?P<content>.*)")
        closer = re.escape(commentend)
        self.regexp_start = re.compile(opener)
        self.regexp_end = re.compile("(?P<content>.*)" + closer)
        self.regexp_start_end = re.compile(opener + closer)
        self.regexp_star_fix = re.compile("^[ \t]*" + re.escape(magicsymbol)
                                          + "(?P<content>.*)")

    def __str__(self):
        return "[BlockCommentStyle] Commentstart: %s Commentend: %s" % \
               (self.commentstart, self.commentend)

    def findstart(self, line, is_in_comment):
        """Return how ``line`` opens a wiki comment, or None.

        Returns a MatchStartEnd when the comment is opened and closed on
        the same line, a MatchStart when it is only opened, and None when
        the line does not open a wiki comment.  ``is_in_comment`` is not
        used.
        """
        # The single-line form must be tried first: regexp_start would
        # also match it and swallow the closing delimiter as content.
        match = self.regexp_start_end.match(line)
        if match:
            return MatchStartEnd(match.group('content') + "\n",
                                 len(match.group('spaces')))

        match = self.regexp_start.match(line)
        if match:
            return MatchStart(match.group('content') + "\n",
                              len(match.group('spaces')))

        return None

    def findnext(self, line, is_in_comment):
        """Classify ``line`` while a block comment is in progress.

        Returns a MatchEnd when the line contains the closing delimiter
        (whatever the value of ``is_in_comment``); only the text before
        the delimiter is kept.  Otherwise, when ``is_in_comment`` is true,
        returns a MatchLineComment holding the line, with a leading magic
        symbol removed if present.  Returns None in the remaining case.
        """
        match = self.regexp_end.match(line)
        if match:
            return MatchEnd(match.group('content') + "\n")

        if not is_in_comment:
            return None

        match = self.regexp_star_fix.match(line)
        if match:
            return MatchLineComment(match.group('content') + "\n")
        # No decoration on this line: keep it untouched.
        return MatchLineComment(line)

Unknown comment style, used for unrecognized language

class UnknownCommentStyle:
    """Comment style used for unrecognized languages: it never matches."""

    def __str__(self):
        label = "[UnknownCommentStyle]"
        return label

    def findstart(self, line, is_in_comment):
        """Never report the start of a comment: the result is always None."""
        return

    def findnext(self, line, is_in_comment):
        """Never report a comment line or end: the result is always None."""
        return

class FileParser:

Build the FileParser. A new language can be added through the self.languagesExtension hashtable or the self.languagesShebangs list. These two objects are built at run time because they depend on commenttag (the marker for Wiki comments), which is user-configurable.

In languagesExtension, we associate an extension to a language description. In languagesShebangs, we associate a shebang to a language description.

A language description is a language name (as recognized by GNU Source Highlight) and a list of the comment styles of the language.

 '.c'   : ("cpp",  [BlockCommentStyle(commenttag, "/*", "*/", "*"),
                    LineCommentStyle(commenttag, "//")])
 

This example shows that we associate the '.c' extension to the language named cpp. We also associate it to the list of supported comment types in the C language. The C language supports block-style comments delimited by /* and */ as well as line-style comments that start with //.

    def __init__(self, commenttag):
        """Build the language lookup tables for the wiki tag ``commenttag``.

        self.languagesExtension maps a file extension to a language
        description, and self.languagesShebangs is a list of
        (shebang regular expression, language description) pairs.  A
        language description is a (language name, list of comment styles)
        tuple.
        """
        self.commenttag = commenttag

        def slash_star_block():
            # /* ... */ comments whose inner lines may start with '*'
            return BlockCommentStyle(commenttag, "/*", "*/", "*")

        def hash_line():
            # Comments introduced by '#'
            return LineCommentStyle(commenttag, "#")

        table = {}
        table['.lp4all'] = ("unknown", [UnknownCommentStyle()])
        table['.ml'] = ("caml", [BlockCommentStyle(commenttag, "(*", "*)", "*")])
        # Every C/C++ extension gets its own pair of style objects.
        for suffix in ('.c', '.h', '.cpp', '.cxx'):
            table[suffix] = ("cpp", [slash_star_block(),
                                     LineCommentStyle(commenttag, "//")])
        table['.py'] = ("python", [hash_line()])
        table['.css'] = ("css", [slash_star_block()])
        self.languagesExtension = table

        # Shell scripts are not recognized by source-highlight, so for
        # the moment, we set the language name to unknown
        python_shebang = ("^#!.*python", ("python", [hash_line()]))
        shell_shebang = ("^#!.*sh", ("unknown", [hash_line()]))
        self.languagesShebangs = [python_shebang, shell_shebang]

Find the language description (name and comment styles) using the languagesExtension and languagesShebangs lists. Warn the user if the type is not recognized.

    def guessLanguage(self, filename, first_line):
        # shebang line
        for (shebang, retval) in self.languagesShebangs:
            if re.compile(shebang).search(first_line) != None:
                return retval
        # file name
        if filename == "Makefile":
            return ("make", [LineCommentStyle(self.commenttag, "#")])
        # file suffix
        (root, ext) = os.path.splitext(filename)

        if ext == "":
            print "WARNING: unknown file type for '%s'" % filename
            return ("unknown", [UnknownCommentStyle()])

        try:
            return self.languagesExtension[ext]
        except KeyError:
            print "WARNING: unknown file extension ('%s' in '%s')" % (ext, filename)
            return ("unknown", [UnknownCommentStyle()])

Parse a file and return a list of Code or Comment objects

    def parse(self, full_filename):
        lineno = 0
        blocks = []
        is_in_comment = False
        linestart = lineno
        text = ""
        wikiblock_indent = 0

        file = open(full_filename, "r")
        file_content = file.readlines()
        file.close()

        (language, comment_styles) =  \
                   self.guessLanguage(os.path.basename(full_filename),
                                      file_content[0])

This is the core of the parsing function. It is implemented using a 2-states state machine. The two states are looking for the beginning of a comment, and looking for sequel or end of the current comment. The state is stored in curstyle. When set to None, we're in the first state. Otherwise, we're in the second state and curstyle contains a reference to the current comment style.

        curstyle = None
        for line in file_content:
            match = None

Try to find if the current line is the beginning of a comment that matches our special tag. We try all styles of comments.

            if not curstyle:
                for style in comment_styles:
                    match = style.findstart(line, is_in_comment=False)
                    if match:

Start of comment that spans over multiple lines. The current style is saved in order to match the next lines of the comment.

                        if match.__class__ == MatchStart:
                            assert curstyle is None
                            if text != "":
                                blocks.append(Code(full_filename, linestart,
                                                   lineno - 1, text, language))
                            text = match.content
                            linestart = lineno
                            curstyle = style
                            wikiblock_indentation = match.indentation

Start (and end) of a comment that is only on the current line

                        elif match.__class__ == MatchStartEnd:
                            assert curstyle is None
                            if text != "":
                                blocks.append(Code(full_filename, linestart,
                                                   lineno - 1, text, language))
                            blocks.append(Comment(match.indentation, full_filename, lineno,
                                                  lineno, match.content))
                            text = ""
                            linestart = lineno + 1

Once we found a matching style, do not try other styles

                        break

We're inside a comment, and we're trying to find the next lines of the comment

            else:
                match = style.findnext(line, is_in_comment=True)
                if match:
                    if match.__class__ == MatchEnd:

End of the comment has been matched. Two cases: the current line is part of the comment itself, or the current line is in fact part of the beginning of the next code block.

                        if match.iscomment:
                            text += match.content
                            blocks.append(Comment(wikiblock_indentation,
                                                  full_filename, linestart,
                                                  lineno, text))
                            text = ""
                            linestart = lineno + 1
                        else:
                            blocks.append(Comment(wikiblock_indentation,
                                                  full_filename, linestart,
                                                  lineno-1, text))
                            text = match.content
                            linestart = lineno
                        curstyle = None
                    elif match.__class__ == MatchLineComment:
                        # We matched one more line for the current
                        # comment
                        text += match.content

Nothing has been matched, we're in a code block.

            if not match:
                assert curstyle is None
                text += line

            lineno += 1

Finalize the last block

        if text != "":
            if curstyle:
                blocks.append(Comment(wikiblock_indentation,
                                      full_filename, linestart, lineno, text))
            else:
                blocks.append(Code(full_filename, linestart, lineno, text,
                                   language))

        return blocks