# -*- coding: utf-8 -*- # lp4all: literate programming embedded in source code as wiki comments # Copyright (C) 2006 Jean-Marie Favreau, Frédéric Lehobey, David Mentré # and Thomas Petazzoni # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation; either version 2 of the License, or # (at your option) any later version. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License along # with this program; if not, write to the Free Software Foundation, Inc., # 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
import sys import os.path import re from iface import *
The different kinds of match for a comment, used by the
LineCommentStyle and BlockCommentStyle classes. See
below.
class MatchStart:
    """First line of a comment that continues on the following lines.

    content     -- text captured after the comment opener and the tag
    indentation -- number of blank characters found before the opener
    """

    def __init__(self, content, indentation):
        self.content, self.indentation = content, indentation


class MatchEnd:
    """Line that terminates the comment currently being read.

    content   -- text carried by the terminating line
    iscomment -- True when that text still belongs to the comment, False
                 when it is already part of what follows the comment
    """

    def __init__(self, content, iscomment = True):
        self.content, self.iscomment = content, iscomment


class MatchStartEnd:
    """Comment that both opens and closes on one single line.

    content     -- text captured between the opener (plus tag) and the end
    indentation -- number of blank characters found before the opener
    """

    def __init__(self, content, indentation):
        self.content, self.indentation = content, indentation


class MatchLineComment:
    """One more line belonging to an already started comment.

    content -- text contributed by that line
    """

    def __init__(self, content):
        self.content = content
The following classes LineCommentStyle, BlockCommentStyle
and UnknownCommentStyle are used to define the comment styles of
the various programming languages supported by this parser. See
here for their usage.
Each of these classes must implement two methods, findstart() and
findnext(). These methods must return one of the four following objects:
 * a MatchStart object if the beginning of a comment has been found
 * a MatchEnd object if the end of a comment has been found
 * a MatchStartEnd object if a single-line comment has been found
 * a MatchLineComment object if a new line has been found for an
   already started comment

Represent a line-style comment, like // in C++
class LineCommentStyle:
    """Comment style where every comment line carries its own marker.

    commenttag   -- marker that must directly follow the comment opener on
                    the first line for the comment to be recognised
    commentstart -- the language's line-comment opener (e.g. "//" or "#")
    """

    def __init__(self, commenttag, commentstart):
        self.commentstart = commentstart
        opener = re.escape(commentstart)
        tag = re.escape(commenttag)
        # First line: optional blanks, opener immediately followed by the tag.
        self.regexp_start = re.compile(
            "^(?P<spaces>[ \t]*)%s%s(?P<content>.*)" % (opener, tag))
        # Following lines: the opener is mandatory, the tag is optional.
        self.regexp_continue = re.compile(
            "^[ \t]*%s(%s)?(?P<content>.*)" % (opener, tag))

    def __str__(self):
        return "[LineCommentStyle] Commmentstart: %s" % self.commentstart

    def findstart(self, line, is_in_comment):
        """Return a MatchStart if `line` opens a tagged comment, else None."""
        found = self.regexp_start.match(line)
        if found is None:
            return None
        return MatchStart(found.group('content') + "\n",
                          len(found.group('spaces')))

    def findnext(self, line, is_in_comment):
        """Classify `line` while a comment of this style is open.

        Returns None when no comment is open. Otherwise returns a
        MatchLineComment when the line is one more comment line, or a
        MatchEnd flagged as "not a comment" carrying the whole line when
        the line no longer starts with the comment opener.
        """
        if not is_in_comment:
            return None
        found = self.regexp_continue.match(line)
        if found is None:
            return MatchEnd(line, False)
        return MatchLineComment(found.group('content') + "\n")
Represent a block-style comment, like /* ... */ in C
class BlockCommentStyle:
    """Comment style delimited by an opener and a closer, like /* ... */.

    commenttag   -- marker that must directly follow the opener for the
                    comment to be recognised
    commentstart -- the language's block-comment opener (e.g. "/*")
    commentend   -- the language's block-comment closer (e.g. "*/")
    magicsymbol  -- decoration (e.g. "*") removed, together with the blanks
                    before it, from the beginning of continuation lines
    """

    def __init__(self, commenttag, commentstart, commentend, magicsymbol):
        self.commentstart = commentstart
        self.commentend = commentend
        closer = re.escape(commentend)
        # Shared prefix: optional blanks, opener immediately followed by tag.
        tagged_opener = ("^(?P<spaces>[ \t]*)" + re.escape(commentstart)
                         + re.escape(commenttag) + "(?P<content>.*)")
        self.regexp_start = re.compile(tagged_opener)
        self.regexp_end = re.compile("(?P<content>.*)" + closer)
        self.regexp_start_end = re.compile(tagged_opener + closer)
        self.regexp_star_fix = re.compile(
            "^[ \t]*%s(?P<content>.*)" % re.escape(magicsymbol))

    def __str__(self):
        return ("[BlockCommentStyle] Commmentstart: %s Commentend: %s"
                % (self.commentstart, self.commentend))

    def findstart(self, line, is_in_comment):
        """Return what `line` opens: a MatchStartEnd, a MatchStart or None.

        The one-line form (opener and closer on the same line) is tried
        first, because such a line also matches the plain opener pattern.
        """
        one_liner = self.regexp_start_end.match(line)
        if one_liner:
            return MatchStartEnd(one_liner.group('content') + "\n",
                                 len(one_liner.group('spaces')))
        opening = self.regexp_start.match(line)
        if opening:
            return MatchStart(opening.group('content') + "\n",
                              len(opening.group('spaces')))
        return None

    def findnext(self, line, is_in_comment):
        """Classify `line` as the end or the continuation of a comment.

        A line containing the closer always yields a MatchEnd holding the
        text that precedes it. Otherwise, when a comment is open, the line
        yields a MatchLineComment (with the leading magic symbol removed if
        present); when no comment is open the result is None.
        """
        closing = self.regexp_end.match(line)
        if closing:
            return MatchEnd(closing.group('content') + "\n")
        if not is_in_comment:
            return None
        decorated = self.regexp_star_fix.match(line)
        if decorated:
            return MatchLineComment(decorated.group('content') + "\n")
        return MatchLineComment(line)
Unknown comment style, used for unrecognized languages
class UnknownCommentStyle:
    """Comment style of a language we know nothing about.

    It never recognises any comment: both lookups always answer None.
    """

    def __str__(self):
        return "[UnknownCommentStyle]"

    def findstart(self, line, is_in_comment):
        """Never finds the beginning of a comment."""
        return None

    def findnext(self, line, is_in_comment):
        """Never finds the continuation or the end of a comment."""
        return None


class FileParser:
    """Split source files into Code and Comment blocks.

    The comment styles to look for are chosen per file from its shebang
    line, its name or its extension.
    """
Build the FileParser. A new language can be added through
the self.languagesExtension hashtable or the
self.languagesShebangs list. These two objects are built at
run time because they depend on commenttag (the marker for
Wiki comments), which is user-configurable.
In languagesExtension, we associate an extension to a
language description. In languagesShebangs, we associate a
shebang to a language description.
A language description is a language name (as recognized by GNU Source Highlight) and a list of the comment styles of the language.
'.c' : ("cpp", [BlockCommentStyle(commenttag, "/*", "*/", "*"),
LineCommentStyle(commenttag, "//")])
This example shows that we associate the '.c' extension to
the language named cpp. We also associate it to the list of
supported comment types in the C language. The C language
supports block-style comments delimited by /* and */ as
well as line-style comments that start with //.
def __init__(self, commenttag):
    """Build the extension and shebang lookup tables for `commenttag`.

    commenttag -- the marker identifying wiki comments; it is stored on
                  the parser and handed to every comment style created
    """
    self.commenttag = commenttag

    # Small factories: every table entry gets its own fresh style objects.
    def c_like():
        return ("cpp", [BlockCommentStyle(commenttag, "/*", "*/", "*"),
                        LineCommentStyle(commenttag, "//")])

    def hash_commented(language):
        return (language, [LineCommentStyle(commenttag, "#")])

    extensions = {}
    extensions['.lp4all'] = ("unknown", [UnknownCommentStyle()])
    extensions['.ml'] = ("caml",
                         [BlockCommentStyle(commenttag, "(*", "*)", "*")])
    for c_extension in ('.c', '.h', '.cpp', '.cxx'):
        extensions[c_extension] = c_like()
    extensions['.py'] = hash_commented("python")
    extensions['.css'] = ("css",
                          [BlockCommentStyle(commenttag, "/*", "*/", "*")])
    self.languagesExtension = extensions

    # Shell scripts are not recognized by source-highlight, so for
    # the moment, we set the language name to unknown
    self.languagesShebangs = [
        ("^#!.*python", hash_commented("python")),
        ("^#!.*sh", hash_commented("unknown")),
    ]
Find the language description (name and comment styles) using
the languagesExtension and languagesShebangs lists. Warn
the user if the type is not recognized.
def guessLanguage(self, filename, first_line):
    """Return the language description to use for a file.

    filename   -- base name of the file (no directory part)
    first_line -- first line of the file, used for shebang detection; an
                  empty string or None simply matches no shebang

    Returns a tuple (language name, list of comment styles). The lookup
    order is: shebang line, well-known file name, file extension. When
    nothing matches, a warning is printed and the "unknown" description
    (which recognises no comment) is returned.
    """
    # 1. Shebang line: the first pattern that matches wins.
    first_line = first_line or ""
    for (shebang, description) in self.languagesShebangs:
        if re.search(shebang, first_line) is not None:
            return description

    # 2. Well-known file names without an extension.
    if filename == "Makefile":
        return ("make", [LineCommentStyle(self.commenttag, "#")])

    # 3. File extension.
    ext = os.path.splitext(filename)[1]
    if ext == "":
        # Parenthesised single-argument form: prints the same text whether
        # `print` is the Python 2 statement or the Python 3 function.
        print("WARNING: unknown file type for '%s'" % filename)
        return ("unknown", [UnknownCommentStyle()])
    try:
        return self.languagesExtension[ext]
    except KeyError:
        print("WARNING: unknown file extension ('%s' in '%s')"
              % (ext, filename))
        return ("unknown", [UnknownCommentStyle()])
Parse a file and return a list of Code or Comment objects
def parse(self, full_filename):
    """Split a source file into an ordered list of Code and Comment blocks.

    full_filename -- path of the file to read

    Returns the list of blocks in file order; an empty file yields an
    empty list. Line numbers handed to the blocks are 0-based indexes
    into the file. Errors raised while opening or reading the file are
    propagated to the caller.
    """
    source = open(full_filename, "r")
    try:
        file_content = source.readlines()
    finally:
        # Close the file even when reading fails.
        source.close()

    # An empty file has no first line to inspect for a shebang.
    if file_content:
        first_line = file_content[0]
    else:
        first_line = ""
    (language, comment_styles) = \
        self.guessLanguage(os.path.basename(full_filename), first_line)

    blocks = []
    text = ""
    linestart = 0
    wikiblock_indentation = 0

    # This is the core of the parsing function. It is implemented
    # using a 2-states state machine. The two states are "looking
    # for the beginning of a comment", and "looking for sequel
    # or end of the current comment". The state is stored in
    # curstyle. When set to None, we're in the first
    # state. Otherwise, we're in the second state and curstyle
    # contains a reference to the current comment style.
    curstyle = None
    for lineno, line in enumerate(file_content):
        match = None

        if curstyle is None:
            # Try to find if the current line is the beginning of a
            # comment that matches our special tag. We try all styles
            # of comments.
            for style in comment_styles:
                match = style.findstart(line, is_in_comment=False)
                if match is None:
                    continue

                if isinstance(match, MatchStart):
                    # Start of comment that spans over multiple lines.
                    # The current style is saved in order to match the
                    # next lines of the comment.
                    if text != "":
                        blocks.append(Code(full_filename, linestart,
                                           lineno - 1, text, language))
                    text = match.content
                    linestart = lineno
                    curstyle = style
                    wikiblock_indentation = match.indentation
                elif isinstance(match, MatchStartEnd):
                    # Start (and end) of a comment that is only on the
                    # current line
                    if text != "":
                        blocks.append(Code(full_filename, linestart,
                                           lineno - 1, text, language))
                    blocks.append(Comment(match.indentation, full_filename,
                                          lineno, lineno, match.content))
                    text = ""
                    linestart = lineno + 1

                # Once we found a matching style, do not try other styles
                break
        else:
            # We're inside a comment, and we're trying to find the next
            # lines of the comment
            match = curstyle.findnext(line, is_in_comment=True)
            if isinstance(match, MatchEnd):
                # End of the comment has been matched. Two cases: the
                # current line is part of the comment itself, or the
                # current line is in fact part of the beginning of the
                # next code block.
                if match.iscomment:
                    text += match.content
                    blocks.append(Comment(wikiblock_indentation,
                                          full_filename, linestart,
                                          lineno, text))
                    text = ""
                    linestart = lineno + 1
                else:
                    blocks.append(Comment(wikiblock_indentation,
                                          full_filename, linestart,
                                          lineno - 1, text))
                    text = match.content
                    linestart = lineno
                curstyle = None
            elif isinstance(match, MatchLineComment):
                # We matched one more line for the current comment
                text += match.content

        # Nothing has been matched, we're in a code block.
        if match is None:
            assert curstyle is None
            text += line

    # Finalize the last block
    # NOTE(review): the end line used here is the number of lines, i.e. one
    # past the index of the last line, whereas the blocks closed inside the
    # loop end on the index of their last line. Kept as it was -- confirm
    # what the consumers of Code/Comment expect before changing it.
    lineno = len(file_content)
    if text != "":
        if curstyle is not None:
            blocks.append(Comment(wikiblock_indentation, full_filename,
                                  linestart, lineno, text))
        else:
            blocks.append(Code(full_filename, linestart, lineno, text,
                               language))
    return blocks
Parsing of source code files
This module parses source code files and splits each file into a list of blocks. Each block is represented by a
Block object, which can be either a Code or a Comment object.