初学python,看《python基础教程》,第20章实现了将文本转化成html的功能。因为本人以前DIY过一个markdown转html的算法,所以对这个例子很有兴趣。可仔细一看,发现很难看懂:一个功能分散在几个文件中,各个类之间的耦合很紧。虽然本人有几年的c++开发经验,但初看这段python代码也觉得头晕。
以下是其源码:
from __future__ import generators


def lines(file):
    """Yield every line of *file*, then one extra newline.

    The extra newline is a sentinel: ``blocks`` only emits a block when it
    meets a blank line, so without it a final block that is not followed by
    a blank line would never be emitted.
    """
    for line in file:
        yield line
    yield '\n'


def blocks(file):
    """Group the lines of *file* into blocks separated by blank lines.

    Consecutive non-blank lines are concatenated and yielded as a single
    string with surrounding whitespace stripped.  Runs of blank lines
    produce no empty blocks.
    """
    block = []
    for line in lines(file):
        if line.strip():
            # Non-blank line: it belongs to the block being collected.
            block.append(line)
        elif block:
            # Blank line after some content: the block is complete.
            yield ''.join(block).strip()
            block = []
# This Python file uses the following encoding: utf-8
class Rule:
    """
    Base class for all rules.

    Subclasses provide a ``type`` name and a ``condition(block)`` method.
    ``action`` wraps the block in the handler's start/end calls for that
    type and returns True to signal that no further rule should run.
    """
    def action(self, block, handler):
        handler.start(self.type)
        handler.feed(block)
        handler.end(self.type)
        return True


class HeadingRule(Rule):
    """
    A heading is a single line that is at most 70 characters and
    that doesn't end with a colon.
    """
    type = 'heading'

    def condition(self, block):
        # endswith() instead of block[-1] so an empty block cannot raise
        # IndexError.
        return ('\n' not in block
                and len(block) <= 70
                and not block.endswith(':'))


class TitleRule(HeadingRule):
    """
    The title is the first block in the document, provided that it is
    a heading.
    """
    type = 'title'
    first = True

    def condition(self, block):
        if not self.first:
            return False
        # Only the very first block that is checked may become the title.
        self.first = False
        return HeadingRule.condition(self, block)


class ListItemRule(Rule):
    """
    A list item is a paragraph that begins with a hyphen. As part of
    the formatting, the hyphen is removed.
    """
    type = 'listitem'

    def condition(self, block):
        # startswith() instead of block[0] so an empty block cannot raise
        # IndexError.
        return block.startswith('-')

    def action(self, block, handler):
        handler.start(self.type)
        # Drop the leading hyphen and the whitespace around the item text.
        handler.feed(block[1:].strip())
        handler.end(self.type)
        return True


# start ListRule {
class ListRule(ListItemRule):
    """
    A list begins between a block that is not a list item and a
    subsequent list item. It ends after the last consecutive list
    item.
    """
    type = 'list'
    inside = False

    def condition(self, block):
        # Always True, because every block has to be inspected.
        return True

    def action(self, block, handler):
        is_item = ListItemRule.condition(self, block)
        if not self.inside and is_item:
            handler.start(self.type)
            self.inside = True
        elif self.inside and not is_item:
            handler.end(self.type)
            self.inside = False
        # Always False, so that the remaining rules still process the block.
        return False
# end ListRule }


class ParagraphRule(Rule):
    """
    A paragraph is simply a block that isn't covered by any of the
    other rules.
    """
    type = 'paragraph'

    def condition(self, block):
        return True
# start Handler {
class Handler:
    """
    An object that handles method calls from the Parser.

    The Parser will call the start() and end() methods at the
    beginning of each block, with the proper block name as
    parameter. The sub() method will be used in regular expression
    substitution. When called with a name such as 'emphasis', it will
    return a proper substitution function.
    """
    def callback(self, prefix, name, *args):
        # Dispatch to e.g. start_paragraph / sub_emphasis if the subclass
        # defines it; names without a matching method are ignored and the
        # call returns None.
        method = getattr(self, prefix + name, None)
        if callable(method):
            return method(*args)

    def start(self, name):
        self.callback('start_', name)

    def end(self, name):
        self.callback('end_', name)

    def sub(self, name):
        def substitution(match):
            # Fall back to the unchanged matched text when there is no
            # sub_<name> method or it returns a falsy value.
            return self.callback('sub_', name, match) or match.group(0)
        return substitution
# end Handler }


# start HTMLHandler {
class HTMLHandler(Handler):
    """
    A specific handler used for rendering HTML.

    The methods in HTMLHandler are accessed from the superclass
    Handler's start(), end(), and sub() methods. They implement basic
    markup as used in HTML documents.  All output is written with
    print, one tag or text chunk per line.
    """
    # Single-argument print(...) produces the same output whether print is
    # a statement (Python 2) or a function (Python 3).
    def start_document(self):
        print('<html><head><title>...</title></head><body>')

    def end_document(self):
        print('</body></html>')

    def start_paragraph(self):
        print('<p>')

    def end_paragraph(self):
        print('</p>')

    def start_title(self):
        print('<h1>')

    def end_title(self):
        print('</h1>')

    def start_heading(self):
        print('<h2>')

    def end_heading(self):
        print('</h2>')

    def start_list(self):
        print('<ul>')

    def end_list(self):
        print('</ul>')

    def start_listitem(self):
        print('<li>')

    def end_listitem(self):
        print('</li>')

    def sub_emphasis(self, match):
        return '<em>%s</em>' % match.group(1)

    def sub_url(self, match):
        return '<a href="%s">%s</a>' % (match.group(1), match.group(1))

    def sub_mail(self, match):
        return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))

    def feed(self, data):
        print(data)

# end HTMLHandler }
import sys
import re
from handlers import *
from util import *
from rules import *


# start Parser {
class Parser:
    """
    A Parser reads a text file, applying rules and controlling a
    handler.

    Filters are applied to every block first (in the order they were
    added); afterwards the rules are tried in order until one of them
    reports that it has fully handled the block.
    """
    def __init__(self, handler):
        self.handler = handler
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        self.rules.append(rule)

    def addFilter(self, pattern, name):
        # The closure is deliberately not called "filter" so the builtin
        # of that name is not shadowed.
        def apply_filter(block, handler):
            return re.sub(pattern, handler.sub(name), block)
        self.filters.append(apply_filter)

    def parse(self, file):
        self.handler.start('document')

        for block in blocks(file):
            for apply_filter in self.filters:
                block = apply_filter(block, self.handler)

            for rule in self.rules:
                if rule.condition(block):
                    # A truthy return value from action() means the block
                    # is fully handled and later rules must be skipped.
                    if rule.action(block, self.handler):
                        break
        self.handler.end('document')
# end Parser }


# start BasicTextParser {
class BasicTextParser(Parser):
    """
    A specific Parser that adds rules and filters in its
    constructor.
    """
    def __init__(self, handler):
        Parser.__init__(self, handler)
        # Order matters: ListRule must see every block before the other
        # rules, and ParagraphRule is the catch-all at the end.
        self.addRule(ListRule())
        self.addRule(ListItemRule())
        self.addRule(TitleRule())
        self.addRule(HeadingRule())
        self.addRule(ParagraphRule())

        self.addFilter(r'\*(.+?)\*', 'emphasis')
        self.addFilter(r'(http://[\.a-zA-Z/]+)', 'url')
        self.addFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', 'mail')
# end BasicTextParser }

handler = HTMLHandler()
parser = BasicTextParser(handler)

parser.parse(sys.stdin)
测试文本如下:
Welcome to World Wide Spam, Inc.
These are the corporate web pages of *World Wide Spam*, Inc. We hope
you find your stay enjoyable, and that you will sample many of our
products.
A short history of the company
World Wide Spam was started in the summer of 2000. The business
concept was to ride the dot-com wave and to make money both through
bulk email and by selling canned meat online.
After receiving several complaints from customers who weren't
satisfied by their bulk email, World Wide Spam altered their profile,
and focused 100% on canned goods. Today, they rank as the world's
13,892nd online supplier of SPAM.
Destinations
From this page you may visit several of our interesting web pages:
- What is SPAM? (http://wwspam.fu/whatisspam)
- How do they make it? (http://wwspam.fu/howtomakeit)
- Why should I eat it? (http://wwspam.fu/whyeatit)
How to get in touch with us
You can get in touch with us in *many* ways: By phone (555-1234), by
email (wwspam@wwspam.fu) or by visiting our customer feedback page
(http://wwspam.fu/feedback).
使用命令行 python markup.py < test_input.txt > out.html 即可将文件转化为有格式的html文件。
上面代码有几点不足之处:
下面是本人改进后的代码:
from __future__ import generators


def lines(file):
    """Yield every line of *file*, then one extra newline.

    The extra newline lets ``blocks`` emit a final block that is not
    followed by a blank line.
    """
    for line in file:
        yield line
    yield '\n'


def lines2(file):
    """Yield the stripped, non-blank lines of *file*, then one newline.

    Blank or whitespace-only lines are skipped.  After the input is
    exhausted a single newline is yielded as an end marker, so the
    consumer always receives one last, content-free line.
    """
    for line in file:
        s = line.strip()
        if s:
            yield s
    yield '\n'


def blocks(file):
    """Group the lines of *file* into blocks separated by blank lines.

    Consecutive non-blank lines are concatenated and yielded as a single
    stripped string.
    """
    block = []
    for line in lines(file):
        if line.strip():
            block.append(line)
        elif block:
            # A blank line closes the block collected so far.
            yield ''.join(block).strip()
            block = []
import re


def createFilter(pattern, fun):
    """Return a function that rewrites a line using *pattern* and *fun*.

    *pattern* is a regular expression and *fun* is called with each match
    object to produce the replacement text, exactly as in ``re.sub``.
    The pattern is compiled once, when the filter is created.
    """
    substitute = re.compile(pattern).sub

    # Not named "filter", to avoid shadowing the builtin.
    def apply_filter(line):
        return substitute(fun, line)
    return apply_filter


def filterEm():
    """Return a filter that turns ``*text*`` into ``<em>text</em>``."""
    def subEm(match):
        return '<em>%s</em>' % match.group(1)
    return createFilter(r'\*(.+?)\*', subEm)


def filterUrl():
    """Return a filter that wraps ``http://`` URLs in an anchor tag."""
    def subUrl(match):
        return '<a href="%s">%s</a>' % (match.group(1), match.group(1))
    return createFilter(r'(http://[\.a-zA-Z/]+)', subUrl)


def filterMail():
    """Return a filter that wraps e-mail addresses in a ``mailto:`` link."""
    def subMail(match):
        return '<a href="mailto:%s">%s</a>' % (match.group(1), match.group(1))
    return createFilter(r'([\.a-zA-Z]+@[\.a-zA-Z]+[a-zA-Z]+)', subMail)


def createFilters():
    """Return the standard filters in application order: em, url, mail."""
    return [filterEm(), filterUrl(), filterMail()]
# This Python file uses the following encoding: utf-8
class Rule:
    """Base class for rules that print their own HTML.

    ``action`` prints the opening markup, the line itself and the closing
    markup, and returns True to signal that no further rule should run.
    Subclasses override ``start``/``end``/``feed`` and supply a
    ``condition(line)`` method.  ``endDoc`` is a hook called once after
    the whole document has been processed.
    """
    # Single-argument print(...) produces the same output whether print is
    # a statement (Python 2) or a function (Python 3).
    def action(self, line):
        self.start(line)
        self.feed(line)
        self.end(line)
        return True

    def start(self, line):
        pass

    def end(self, line):
        pass

    def feed(self, line):
        print(line)

    def endDoc(self):
        pass


class HeadingRule(Rule):  # {{{
    """A heading is one line of at most 30 characters not ending in ':'."""
    def condition(self, line):
        # endswith() instead of line[-1] so an empty line cannot raise
        # IndexError.
        return ('\n' not in line
                and len(line) <= 30
                and not line.endswith(':'))

    def start(self, line):
        print('<h2>')

    def end(self, line):
        print('</h2>')


class TitleRule(HeadingRule):
    """The title is the first line checked, provided it is a heading."""
    first = True

    def condition(self, line):
        if not self.first:
            return False
        # Only the very first line that is checked may become the title.
        self.first = False
        return HeadingRule.condition(self, line)

    def start(self, line):
        print('<h1>')

    def end(self, line):
        print('</h1>')  # }}}


class ListItemRule(Rule):  # {{{
    """A list item is a line that begins with a hyphen."""
    def condition(self, line):
        # startswith() instead of line[0] so an empty line cannot raise
        # IndexError.
        return line.startswith('-')

    def feed(self, line):
        # Drop the leading hyphen and surrounding whitespace.
        print(line[1:].strip())

    def start(self, line):
        print('<li>')

    def end(self, line):
        print('</li>')


class ListRule(ListItemRule):
    """Opens <ul> before the first list item and closes it after the last."""
    inside = False
    # NOTE(review): firstIn / firstOut are not read anywhere in this
    # listing; kept in case other code refers to them.
    firstIn = False
    firstOut = False

    def condition(self, line):
        # Always True: every line has to be inspected.
        return True

    def action(self, line):
        is_item = ListItemRule.condition(self, line)
        if not self.inside and is_item:
            self.start(line)
            self.inside = True
        elif self.inside and not is_item:
            self.end(line)
            self.inside = False
        # Always False, so that the remaining rules still process the line.
        return False

    def start(self, line):
        print('<ul>')

    def end(self, line):
        print('</ul>')

    def feed(self, line):
        pass  # }}}


class ParagraphRule(Rule):
    """Catch-all rule: any line not handled earlier is a paragraph."""

    def condition(self, line):
        return True

    def start(self, line):
        print('<p>')

    def end(self, line):
        print('</p>')


class DocumentRule(Rule):
    """Prints the document header on the first line and the footer at the end."""
    first = True
    isStart = False

    def condition(self, line):
        # True only for the first line that is checked.
        if self.first:
            self.first = False
            self.isStart = True
            return True
        return False

    def action(self, line):
        if self.isStart:
            self.start(line)
            self.isStart = False
        # Always False, so that the remaining rules still process the line.
        return False

    def start(self, line):
        print('<html><head><title>...</title></head><body>')

    def end(self, line):
        print('</body></html>')

    def endDoc(self):
        self.end('')
# This Python file uses the following encoding: utf-8
from util import *
from rules import *
import re
import sys


class MyParser:
    """Line-based parser that applies filters and then rules to each line.

    Filters are callables taking a line and returning the rewritten line.
    Rules provide ``condition(line)``, ``action(line)`` and ``endDoc()``.
    """
    def __init__(self):
        self.rules = []
        self.filters = []

    def addRule(self, rule):
        self.rules.append(rule)

    def setFilters(self, filters):
        self.filters = filters

    def parse(self, file):
        for line in lines2(file):

            # The loop variable is not called "filter", to avoid
            # shadowing the builtin.
            for apply_filter in self.filters:
                line = apply_filter(line)

            for rule in self.rules:
                if rule.condition(line):
                    # A truthy return value from action() means the line
                    # is fully handled and later rules must be skipped.
                    if rule.action(line):
                        break

        # Called once the document is finished, so that every rule can do
        # its clean-up work.
        for rule in self.rules:
            rule.endDoc()
from parsers import *
from util import *
from rules import *
from filters import *
import sys


# Build the parser.  Rule order matters: DocumentRule and ListRule never
# stop the rule chain (their action() returns False), while ParagraphRule
# accepts every line and therefore has to come last.
p = MyParser()
p.addRule(DocumentRule())
p.addRule(ListRule())
p.addRule(ListItemRule())
p.addRule(TitleRule())
p.addRule(HeadingRule())
p.addRule(ParagraphRule())
p.setFilters(createFilters())

# Read the text from standard input; the HTML is printed to standard output.
p.parse(sys.stdin)
使用命令 python main.py < test_input.txt > out.html 运行。
有以下几点改动:
最后,代码应该写得容易让人看懂 (尤其是在一本入门教程中)。
ps: 本人接下来将基于上面的框架用python写个markdown转html的算法,然后再将代码转化成c++代码。最后完善自己的笔记软件,并且用Qt写个跨windows/mac平台的markdown编辑器。