1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23 """module for parsing html files for translation"""
24
25 import re
26 from translate.storage import base
27 from HTMLParser import HTMLParser
28
30 """A unit of translatable/localisable HTML content"""
34
38
41 source = property(getsource, setsource)
42
44 self.locations.append(location)
45
48
49
class htmlfile(HTMLParser, base.TranslationStore):
    """HTML parser that doubles as a translation store."""

    # Unit type associated with this store.
    UnitClass = htmlunit

    # An opening tag named here starts a new block.
    markingtags = [
        "p", "title",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "th", "td", "div", "li", "dt", "dd",
        "address", "caption",
    ]

    # An attribute named here makes its opening tag start a new block.
    markingattrs = []

    # Values of these attributes are handed to addhtmlblock on their own.
    includeattrs = ["alt", "summary", "standby", "abbr", "content"]
55
def __init__(self, includeuntaggeddata=None, inputfile=None):
    """Initialise the parser state and optionally parse a file right away.

    includeuntaggeddata -- stored on the instance; when true, character
        data met while no block is open is collected as well.
    inputfile -- optional object providing read() and close().  Its
        ``name`` attribute, if any, is stored as ``self.filename``.  The
        object is always closed once it has been read.
    """
    self.units = []
    self.filename = getattr(inputfile, 'name', None)
    self.currentblock = ""
    self.currentblocknum = 0
    self.currenttag = None
    self.includeuntaggeddata = includeuntaggeddata
    HTMLParser.__init__(self)

    if inputfile is not None:
        # try/finally rather than ``with``: the caller may pass a file-like
        # object that is not a context manager.  This guarantees the handle
        # is released even when read() raises.
        try:
            htmlsrc = inputfile.read()
        finally:
            inputfile.close()
        self.parse(htmlsrc)
69
def guess_encoding(self, htmlsrc):
    """Returns the encoding of the html text.

    We look for 'charset=' within a meta tag to do this.  The classic
    ``<meta ... content="...; charset=X">`` declaration is tried first; when
    it is absent the HTML5 shorthand ``<meta charset="X">`` is accepted too.

    Returns the charset name exactly as written in the markup, or None when
    no declaration is found.
    """
    # Historic pattern, tried first and unchanged in meaning, so every
    # document that was recognised before still yields the same value.
    legacy = r'''(?i)<meta.*content.*=.*charset.*=\s*([^\s]*)\s*["']'''
    found = re.findall(legacy, htmlsrc)
    if found:
        return found[0]

    # Fallback for <meta charset=...>, quoted or unquoted.
    html5 = r'''(?i)<meta\s[^>]*?charset\s*=\s*["']?([^\s"'/>;]+)'''
    match = re.search(html5, htmlsrc)
    if match:
        return match.group(1)
    return None
82
def do_encoding(self, htmlsrc):
    """Return the html text properly encoded based on a charset.

    The charset is obtained from ``self.guess_encoding(htmlsrc)``.  The
    input is returned unchanged when no charset is declared, when it is
    already decoded text, or when the declared charset names no known codec.
    """
    charset = self.guess_encoding(htmlsrc)
    if not charset:
        return htmlsrc
    # ``type(u"")`` is the decoded-text type on every interpreter that
    # accepts the literal; such input must not be decoded a second time.
    if isinstance(htmlsrc, type(u"")):
        return htmlsrc
    try:
        return htmlsrc.decode(charset)
    except LookupError:
        # The markup declared a charset Python has no codec for; treat it
        # like a missing declaration instead of aborting the parse.
        return htmlsrc
90
def parse(self, htmlsrc):
    """Decode ``htmlsrc`` via do_encoding and feed the result to the parser."""
    decoded = self.do_encoding(htmlsrc)
    self.feed(decoded)
94
101
def strip_html(self, text):
    """Strip unnecessary html from the text.

    HTML tags are deemed unnecessary if it fully encloses the translatable
    text, eg. '<a href="index.html">Home Page</a>'.

    HTML tags that occurs within the normal flow of text will not be removed,
    eg. 'This is a link to the <a href="index.html">Home Page</a>.'

    A leading tag only counts as enclosing when the trailing closing tag
    has the same name and really is its partner.  Text such as
    '<a href="x">One</a> and <b>two</b>' is therefore returned intact.
    """
    text = text.strip()
    while True:
        outer = re.match(r'(?is)^<([a-z][^\s/>]*)([^>]*)>(.*)</\1\s*>$', text)
        if outer is None:
            return text
        name, attrs, inner = outer.groups()
        if attrs.endswith('/'):
            # A self-closing first tag cannot enclose anything.
            return text

        # Walk the same-named tags inside: if the leading tag is closed
        # before the end, or an inner one is left open, the final closing
        # tag does not belong to the leading tag.
        same_tag = r'(?i)<(/?)%s(?=[\s/>])([^>]*)>' % re.escape(name)
        depth = 0
        for tag in re.finditer(same_tag, inner):
            if tag.group(1):
                depth -= 1
                if depth < 0:
                    return text
            elif not tag.group(2).endswith('/'):
                depth += 1
        if depth != 0:
            return text

        # One enclosing layer removed; look for a further one.
        text = inner.strip()
118
def has_translatable_content(self, text):
    """Check if the supplied HTML snippet has any content that needs to be translated.

    Returns False for a snippet in which the charset pattern matches exactly
    once, and for a snippet that holds nothing but tags, whitespace and
    ``&nbsp;`` entities; returns True otherwise.
    """
    text = text.strip()

    # Same charset test as before, e.g. the value of a meta content attribute.
    if len(re.findall('(?i).*(charset.*=.*)', text)) == 1:
        return False

    # Drop the tags, then the non-breaking-space entity.  The former
    # ``text == ' '`` test could never be true after strip(), so a block
    # made only of such padding was wrongly reported as translatable.
    remainder = re.sub('<[^>]*>', '', text)
    remainder = remainder.replace('&nbsp;', ' ')
    return bool(remainder.strip())
137
138
139
def startblock(self, tag):
    """Flush the text gathered so far and open a new block for ``tag``."""
    # Flush first: the pending text is handed over before it is reset.
    self.addhtmlblock(self.currentblock)
    self.currentblock, self.currenttag = "", tag
144
def endblock(self):
    """Flush the text gathered so far and leave the parser with no open block."""
    # Flush first: the pending text is handed over before it is reset.
    self.addhtmlblock(self.currentblock)
    self.currentblock, self.currenttag = "", None
149
def handle_starttag(self, tag, attrs):
    """Handle an opening tag.

    tag -- name of the tag.
    attrs -- sequence of (name, value) pairs; value is None for an
        attribute written without a value.

    Values of attributes listed in ``includeattrs`` are passed to
    addhtmlblock.  A tag in ``markingtags``, or one carrying an attribute in
    ``markingattrs``, starts a new block; any other tag is appended verbatim
    to the block that is currently open, if there is one.
    """
    newblock = tag in self.markingtags
    for attrname, attrvalue in attrs:
        if attrname in self.markingattrs:
            newblock = True
        # A valueless attribute (e.g. <img alt>) has no text to extract;
        # passing None on would fail in the string handling downstream.
        if attrname in self.includeattrs and attrvalue is not None:
            self.addhtmlblock(attrvalue)

    if newblock:
        self.startblock(tag)
    elif self.currenttag is not None:
        self.currentblock += self.get_starttag_text()
164
def handle_startendtag(self, tag, attrs):
    """Handle a self-closing tag such as ``<img ... />``.

    tag -- name of the tag.
    attrs -- sequence of (name, value) pairs; value is None for an
        attribute written without a value.

    Values of attributes listed in ``includeattrs`` are passed to
    addhtmlblock, and the tag text is appended to the block that is
    currently open, if there is one.  Such a tag never starts a block.
    """
    for attrname, attrvalue in attrs:
        # A valueless attribute has no text to extract; skip it rather than
        # hand None to the string handling downstream.
        if attrname in self.includeattrs and attrvalue is not None:
            self.addhtmlblock(attrvalue)
    if self.currenttag is not None:
        self.currentblock += self.get_starttag_text()
171
def handle_endtag(self, tag):
    """Handle a closing tag.

    The tag that opened the current block ends it; any other closing tag
    is appended to the open block, or ignored when no block is open.
    """
    opener = self.currenttag
    if opener == tag:
        self.endblock()
        return
    if opener is None:
        return
    self.currentblock += '</%s>' % tag
177
def handle_data(self, data):
    """Handle character data.

    Inside an open block the data is appended to it.  With no block open
    the data is dropped unless ``includeuntaggeddata`` is true, in which
    case startblock(None) is called before the data is appended.
    """
    if self.currenttag is None:
        if not self.includeuntaggeddata:
            return
        self.startblock(None)
    self.currentblock += data
184
187
190
194
197