初學python,看《python基礎教程》,第20章實現了將文本轉化成html的功能。由於本人之前有DIY一個markdown轉html的算法,所以對這個例子有興趣。可仔細一看,發現很難看懂,一個功能分散在幾個文件中,各個類的耦合非常緊。雖然自己有幾年的c++開發經驗,但初看這個python代碼也覺得頭暈。
以下是其源碼

from __future__ import generators
def lines(file):
    """Yield every line of *file*, followed by one extra '\n'.

    The extra newline is a blank line appended after the real input, so
    a consumer that reacts to blank lines sees one at the very end.
    """
    for text in iter(file):
        yield text
    yield '\n'
def blocks(file):
    """Group consecutive non-blank lines of *file* into blocks.

    Each block is yielded as one string: its lines joined together with
    leading and trailing whitespace stripped. Blank lines only separate
    blocks and are never yielded themselves.
    """
    pending = []
    for text in lines(file):
        if not text.strip():
            # Blank line: flush whatever has been collected so far.
            if pending:
                yield ''.join(pending).strip()
                pending = []
            continue
        pending.append(text)
util.py

# This Python file uses the following encoding: utf-8
class Rule:
    """
    Base class for all rules.

    ``action`` reads ``self.type``, which this class does not define;
    subclasses are expected to provide it.
    """
    def action(self, block, handler):
        """Send start, the block text, and end to *handler*; return True."""
        kind = self.type
        handler.start(kind)
        handler.feed(block)
        handler.end(kind)
        return True
class HeadingRule(Rule):
    """
    A heading is a single-line block of at most 70 characters whose
    last character is not a colon.
    """
    type = 'heading'

    def condition(self, block):
        """Return True when *block* satisfies the heading criteria."""
        if '\n' in block:
            return False
        if len(block) > 70:
            return False
        return block[-1] != ':'
class TitleRule(HeadingRule):
    """
    The title is the first block this rule is asked about, provided
    that block also qualifies as a heading.
    """
    type = 'title'
    first = True  # shadowed by an instance attribute after the first check

    def condition(self, block):
        """Test only the first block seen; every later call returns False."""
        if self.first:
            self.first = False
            return HeadingRule.condition(self, block)
        return False
class ListItemRule(Rule):
    """
    A list item is a paragraph that begins with a hyphen. As part of
    the formatting, the hyphen is removed.
    """
    type = 'listitem'

    def condition(self, block):
        """Return True when *block* starts with '-'."""
        # startswith() also copes with an empty string, where block[0]
        # would raise IndexError.
        return block.startswith('-')

    def action(self, block, handler):
        """Emit the item text without its leading hyphen.

        The text after the hyphen is stripped of surrounding whitespace
        before being fed to *handler*.
        """
        handler.start(self.type)
        handler.feed(block[1:].strip())
        handler.end(self.type)
        # True rather than 1, matching what Rule.action returns.
        return True
# start ListRule {
class ListRule(ListItemRule):
    """
    Brackets runs of list items with list start/end events.

    A list opens when a list item arrives while no list is open, and
    closes at the first block that is not a list item.
    NOTE(review): the end event is only emitted when a later non-item
    block arrives, so a list that runs to the end of the input looks
    like it is never closed -- confirm against the parser.
    """
    type = 'list'
    inside = False  # True between this rule's start and end events

    def condition(self, block):
        # Always applies: every block has to be checked for a transition.
        return True

    def action(self, block, handler):
        """Emit list start/end on a transition; never consume the block."""
        is_item = ListItemRule.condition(self, block)
        if is_item and not self.inside:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        # Always False so that the remaining rules still see this block.
        return False
# end ListRule }
class ParagraphRule(Rule):
    """
    Catch-all rule: its condition accepts every block, so it handles
    whatever no other rule has handled.
    """
    type = 'paragraph'

    def condition(self, block):
        # Unconditional match.
        return True
rules.py

# start Handler {
class Handler:
    """
    Dispatches named events to optional methods on the instance.

    start(name) and end(name) look for an attribute called
    'start_<name>' / 'end_<name>' and call it when it is callable.
    sub(name) returns a function usable as the replacement argument of
    re.sub; that function forwards the match to 'sub_<name>'.
    """
    def callback(self, prefix, name, *args):
        """Call the attribute named prefix+name with *args, if callable.

        Returns the call's result, or None when no such callable exists.
        """
        target = getattr(self, prefix + name, None)
        if not callable(target):
            return None
        return target(*args)

    def start(self, name):
        self.callback('start_', name)

    def end(self, name):
        self.callback('end_', name)

    def sub(self, name):
        """Return a replacement function for 'sub_<name>'.

        When the callback yields a false value (including when it does
        not exist) the matched text is returned unchanged.
        """
        def replace(match):
            return self.callback('sub_', name, match) or match.group(0)
        return replace
# end Handler }
# start HTMLHandler {
class HTMLHandler(Handler):
    """
    A specific handler used for rendering HTML.

    The start_*/end_*/sub_* methods are reached by name through the
    superclass Handler's start(), end() and sub() methods. Block markup
    is written to standard output; the sub_* methods return the
    replacement text for a regular-expression match.

    Output uses the single-argument print(...) form, which prints the
    same text on Python 2 and also runs on Python 3.
    """
    def start_document(self):
        print('<html><head><title>...</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p>')

    def end_paragraph(self):
        print('</p>')

    def start_title(self):
        print('<h1>')

    def end_title(self):
        print('</h1>')

    def start_heading(self):
        print('<h2>')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul>')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def sub_emphasis(self, match):
        """Wrap the first captured group in <em>...</em>."""
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        """Turn the captured URL into a link whose text is the URL."""
        return '<a href="%s">%s</a>' % (match.group(1), match.group(1))

    def sub_mail(self, match):
        """Turn the captured address into a mailto: link."""
        return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))

    def feed(self, data):
        """Write block text to standard output unchanged."""
        print(data)
# end HTMLHandler }
handlers.py

import sys
import re
from handlers import *
from util import *
from rules import *
# start Parser {
class Parser:
    """
    Reads a text file block by block, runs each block through the
    registered filters, then offers it to the registered rules, which
    drive the handler.
    """
    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Append *rule*; rules are tried in the order they were added."""
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        """Register a regex substitution whose replacement is handler.sub(name)."""
        def apply(block, handler):
            return re.sub(pattern, handler.sub(name), block)
        self.filters.append(apply)

    def parse(self, file):
        """Process every block of *file* between document start/end events.

        For each block, rules whose condition holds have their action
        run in order; the first action that returns a true value stops
        further rule processing for that block.
        """
        sink = self.handler
        sink.start('document')
        for block in blocks(file):
            for apply in self.filters:
                block = apply(block, sink)
            for rule in self.rules:
                if rule.condition(block) and rule.action(block, sink):
                    break
        sink.end('document')
# end Parser }
# start BaseTextParser {
class BasicTextParser(Parser):
    """
    A Parser pre-loaded with the standard rules and inline filters.
    """
    def __init__(self, handler):
        Parser.__init__(self, handler)
        # Registration order is the order in which rules are tried.
        for rule_class in (ListRule, ListItemRule, TitleRule,
                           HeadingRule, ParagraphRule):
            self.addRule(rule_class())
        # (pattern, handler substitution name) pairs for inline markup.
        inline_markup = (
            (r'\*(.+?)\*', 'emphasis'),
            (r'(http://[\.a-zA-Z/]+)', 'url'),
            (r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail'),
        )
        for pattern, name in inline_markup:
            self.addFilter(pattern, name)
# end BaseTextParser }
# Convert the text on standard input using the HTML handler.
parser = BasicTextParser(HTMLHandler())
handler = parser.handler
parser.parse(sys.stdin)
markup.py
文本如下

Welcome to World Wide Spam, Inc.

These are the corporate web pages of *World Wide Spam*, Inc. We hope you find your stay enjoyable, and that you will sample many of our products.

A short history of the company

World Wide Spam was started in the summer of 2000. The business concept was to ride the dot-com wave and to make money both through bulk email and by selling canned meat online.

After receiving several complaints from customers who weren't satisfied by their bulk email, World Wide Spam altered their profile, and focused 100% on canned goods. Today, they rank as the world's 13,892nd online supplier of SPAM.

Destinations

From this page you may visit several of our interesting web pages:

- What is SPAM? (http://wwspam.fu/whatisspam)

- How do they make it? (http://wwspam.fu/howtomakeit)

- Why should I eat it? (http://wwspam.fu/whyeatit)

How to get in touch with us

You can get in touch with us in *many* ways: By phone (555-1234), by email (wwspam@wwspam.fu) or by visiting our customer feedback page (http://wwspam.fu/feedback).
test_input.txt
使用命令行 python markup.py < test_input.txt > out.html 即可將文件轉化為有格式的html文件
上面代碼有幾點不足之處:一個功能分散在幾個文件中;各個類之間耦合很緊(Rule 依賴 Handler,Handler 又通過 getattr 按名字動態查找方法),初學者很難跟蹤調用流程。
下面是本人改進後的代碼

from __future__ import generators
def lines(file):
    """Yield every line of *file*, followed by one extra '\n'."""
    for text in iter(file):
        yield text
    # Extra blank line appended after the real input.
    yield '\n'
def lines2(file):
    """Yield each non-blank line of *file* stripped, then a final '\n'.

    Lines that are empty after stripping are skipped. The closing '\n'
    is yielded as-is, so it is the only item that is not stripped.
    """
    for raw in file:
        text = raw.strip()
        if not text:
            continue
        yield text
    yield '\n'
def blocks(file):
    """Group consecutive non-blank lines of *file* into blocks.

    Each block is yielded as one string: its lines joined together with
    leading and trailing whitespace stripped. Blank lines only separate
    blocks and are never yielded themselves.
    """
    pending = []
    for text in lines(file):
        if not text.strip():
            # Blank line: flush whatever has been collected so far.
            if pending:
                yield ''.join(pending).strip()
                pending = []
            continue
        pending.append(text)
util.py

import re
def createFilter(pattern, fun):
    """Return a one-argument function applying re.sub(pattern, fun, line).

    *fun* is passed straight through as re.sub's replacement argument.
    """
    return lambda line: re.sub(pattern, fun, line)
def filterEm():
    """Build the filter that rewrites *text* as <em>text</em>."""
    return createFilter(
        r'\*(.+?)\*',
        lambda match: '<em>%s</em>' % match.group(1),
    )
def filterUrl():
    """Build the filter that turns matched http:// URLs into links."""
    def as_link(match):
        url = match.group(1)
        return '<a href="%s">%s</a>' % (url, url)
    return createFilter(r'(http://[\.a-zA-Z/]+)', as_link)
def filterMail():
    """Build the filter that turns matched e-mail addresses into mailto: links."""
    def as_mailto(match):
        address = match.group(1)
        return '<a href="mailto:%s">%s</a>' % (address, address)
    return createFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', as_mailto)
def createFilters():
    """Return a new list of filters: emphasis, URL, then mail."""
    return [filterEm(), filterUrl(), filterMail()]
filters.py

# This Python file uses the following encoding: utf-8
class Rule:
    """
    Base class for the line rules.

    action() calls start(), feed() and end() in that order. start() and
    end() do nothing here, and feed() prints the line. endDoc() is a
    hook that does nothing in this base class.

    Output uses the single-argument print(...) form, which prints the
    same text on Python 2 and also runs on Python 3.
    """
    def action(self, line):
        """Render *line* via start/feed/end and return True."""
        self.start(line)
        self.feed(line)
        self.end(line)
        return True

    def start(self, line):
        pass

    def end(self, line):
        pass

    def feed(self, line):
        print(line)

    def endDoc(self):
        pass
class HeadingRule(Rule): # {{{
    """
    A heading is a line of at most 30 characters that contains no
    newline and does not end with a colon; it is wrapped in <h2> tags.
    """
    def condition(self, line):
        """Return True when *line* satisfies the heading criteria."""
        return '\n' not in line and len(line) <= 30 and not line[-1] == ':'

    def start(self, line):
        print('<h2>')

    def end(self, line):
        print('</h2>')
class TitleRule(HeadingRule):
    """
    The title is the first line this rule is asked about, provided that
    line also qualifies as a heading; it is wrapped in <h1> tags.
    """
    first = True  # shadowed by an instance attribute after the first check

    def condition(self, line):
        """Test only the first line seen; every later call returns False."""
        if not self.first:
            return False
        self.first = False
        return HeadingRule.condition(self, line)

    def start(self, line):
        print('<h1>')

    def end(self, line):
        print('</h1>') # }}}
class ListItemRule(Rule): # {{{
    """
    A list item is a line that begins with a hyphen. The hyphen is
    dropped and the remaining text is printed between <li> tags.
    """
    def condition(self, line):
        """Return True when *line* starts with '-'."""
        # startswith() also copes with an empty string, where line[0]
        # would raise IndexError.
        return line.startswith('-')

    def feed(self, line):
        # Drop the hyphen, then any whitespace around the item text.
        print(line[1:].strip())

    def start(self, line):
        print('<li>')

    def end(self, line):
        print('</li>')
class ListRule(ListItemRule):
    """
    Brackets runs of list items with <ul> ... </ul>.

    The list opens when a list item arrives while no list is open, and
    closes at the first line that is not a list item.
    """
    inside = False  # True between the printed <ul> and </ul>
    # NOTE(review): firstIn / firstOut are not read anywhere in this
    # class; kept in case other code refers to them.
    firstIn = False
    firstOut = False

    def condition(self, line):
        # Always applies: every line has to be checked for a transition.
        return True

    def action(self, line):
        """Print <ul> or </ul> on a transition; never consume the line."""
        if not self.inside and ListItemRule.condition(self, line):
            self.start(line)
            self.inside = True
        elif self.inside and not ListItemRule.condition(self, line):
            self.end(line)
            self.inside = False
        # Always False so that the remaining rules still see this line.
        return False

    def start(self, line):
        print('<ul>')

    def end(self, line):
        print('</ul>')

    def feed(self, line):
        pass # }}}
class ParagraphRule(Rule):
    """
    Catch-all rule: its condition accepts every line, and the line is
    printed between <p> tags.
    """
    def condition(self, line):
        # Unconditional match.
        return True

    def start(self, line):
        print('<p>')

    def end(self, line):
        print('</p>')
class DocumentRule(Rule):
    """
    Prints the HTML document header and footer.

    The header is printed when the rule acts on the first line it is
    asked about; the footer is printed by endDoc().
    """
    first = True     # True until condition() has been called once
    isStart = False  # set by condition(), consumed by action()

    def condition(self, line):
        """Return True for the first line only."""
        if self.first:
            self.first = False
            self.isStart = True
            return True
        return False

    def action(self, line):
        """Print the header once; never consume the line."""
        if self.isStart:
            self.start(line)
            self.isStart = False
        # False so that the remaining rules still see this line.
        return False

    def start(self, line):
        print('<html><head><title>...</title></head><body>')

    def end(self, line):
        print('</body></html>')

    def endDoc(self):
        # The footer does not depend on any line, so pass an empty one.
        self.end('')
rules.py

# This Python file uses the following encoding: utf-8
from util import *
from rules import *
import re
import sys
class MyParser:
    """
    Line-based parser: each line is passed through the filters and then
    offered to the rules in the order they were added.
    """
    def __init__(self):
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        """Append *rule* to the list of rules."""
        self.rules.append(rule)

    def setFilters(self, filters):
        """Replace the current filter list with *filters*."""
        self.filters = filters

    def parse(self, file):
        """Process every line of *file*, then call endDoc() on each rule.

        For each line, rules whose condition holds have their action
        run in order; the first action that returns a true value stops
        further rule processing for that line.
        """
        for line in lines2(file):
            for apply in self.filters:
                line = apply(line)
            for rule in self.rules:
                if rule.condition(line) and rule.action(line):
                    break
        # Called once the input is exhausted so rules can do their
        # end-of-document work.
        for rule in self.rules:
            rule.endDoc()
parsers.py

from parsers import *
from util import *
from rules import *
from filters import *
import sys

# Rules are tried in the order they are added.
p = MyParser()
p.addRule(DocumentRule())
p.addRule(ListRule())
p.addRule(ListItemRule())
p.addRule(TitleRule())
p.addRule(HeadingRule())
p.addRule(ParagraphRule())
p.setFilters(createFilters())
# Convert the text on standard input; the rules print the HTML.
p.parse(sys.stdin)
main.py
使用命令 python main.py < test_input.txt > out.html 運行
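有如下幾點改動:(1) 去掉了 Handler 類,每條規則用自己的 start/end/feed 方法直接輸出對應的 html 標籤;(2) 過濾器獨立放到 filters.py 中,用閉包實現;(3) 解析器改為用 lines2 逐行處理,不再按 block 處理;(4) 增加了 DocumentRule 和 endDoc,用來輸出文檔的開頭和結尾。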
最後,代碼應該寫得容易讓人看得懂 (尤其是在一本初始教程中)。
ps: 本人接下來將用上面的框架用python寫個markdown轉html的算法,然後再將代碼轉化成c++代碼。最後完善自己的筆記軟件並且用Qt寫個跨windows/mac平台的markdown的編輯器。