1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22 """This module contains all the common features for languages.
23
24 Supported features:
25 language code (km, af)
26 language name (Khmer, Afrikaans)
27 Plurals
28 Number of plurals (nplurals)
29 Plural equation
30 pofilter tests to ignore
31
32 Segmentation
33 characters
34 words
35 sentences
36
37 TODO:
38 Ideas for possible features:
39
40 Language-Team information
41
42 Segmentation
43 phrases
44
45 Punctuation
46 End of sentence
47 Start of sentence
48 Middle of sentence
49 Quotes
50 single
51 double
52
53 Valid characters
54 Accelerator characters
55 Special characters
56 Direction (rtl or ltr)
57 """
58
59 from translate.lang import data
60 import re
61
63 """This class is the common parent class for all language classes."""
64
65 code = ""
66 """The ISO 639 language code, possibly with a country specifier or other
67 modifier.
68
69 Examples:
70 km
71 pt_BR
72 sr_YU@Latn
73 """
74
75 fullname = ""
76 """The full (English) name of this language.
77
78 Dialect codes should have the form of
79 Khmer
80 Portuguese (Brazil)
81 #TODO: sr_YU@Latn?
82 """
83
84 nplurals = 0
85 """The number of plural forms of this language.
86
87 0 is not a valid value - it must be overridden.
88 Any positive integer is valid (it should probably be between 1 and 6)
89 Also see data.py
90 """
91
92 pluralequation = "0"
93 """The plural equation for selection of plural forms.
94
95 This is used for PO files to fill into the header.
96 See U{http://www.gnu.org/software/gettext/manual/html_node/gettext_150.html}.
97 Also see data.py
98 """
99
100 listseperator = u", "
101 """This string is used to separate lists of textual elements. Most
102 languages probably can stick with the default comma, but Arabic and some
103 Asian languages might want to override this."""
104
105 commonpunc = u".,;:!?-@#$%^*_()[]{}/\\'`\"<>"
106 """These punctuation marks are common in English and most languages that
107 use latin script."""
108
109 quotes = u"‘’‛“”„‟′″‴‵‶‷‹›«»"
110 """These are different quotation marks used by various languages."""
111
112 invertedpunc = u"¿¡"
113 """Inverted punctuation sometimes used at the beginning of sentences in
114 Spanish, Asturian, Galician, and Catalan."""
115
116 rtlpunc = u"،؟؛÷"
117 """These punctuation marks are used by Arabic and Persian, for example."""
118
119 CJKpunc = u"。、,;!?「」『』【】"
120 """These punctuation marks are used in certain circumstances with CJK
121 languages."""
122
123 indicpunc = u"।॥॰"
124 """These punctuation marks are used by several Indic languages."""
125
126 ethiopicpunc = u"።፤፣"
127 """These punctuation marks are used by several Ethiopic languages."""
128
129 miscpunc = u"…±°¹²³·©®×£¥€"
130 """The middle dot (·) is used by Greek and Georgian."""
131
132 punctuation = u"".join([commonpunc, quotes, invertedpunc, rtlpunc, CJKpunc,\
133 indicpunc, ethiopicpunc, miscpunc])
134 """We include many types of punctuation here, simply since this is only
135 meant to determine if something is punctuation. Hopefully we catch some
136 languages which might not be represented with modules. Most languages won't
137 need to override this."""
138
139 sentenceend = u".!?…։؟।。!?።"
140 """These marks can indicate a sentence end. Once again we try to account
141 for many languages. Most languages won't need to override this."""
142
143
144
145
146
147 sentencere = re.compile(r"""(?s) #make . also match newlines
148 .*? #anything, but match non-greedy
149 [%s] #the puntuation for sentence ending
150 \s+ #the spacing after the puntuation
151 (?=[^a-z\d])#lookahead that next part starts with caps
152 """ % sentenceend, re.VERBOSE)
153
154 puncdict = {}
155 """A dictionary of punctuation transformation rules that can be used by punctranslate()."""
156
157 ignoretests = []
158 """List of pofilter tests for this language that must be ignored."""
159
160 checker = None
161 """A language specific checker (see filters.checks).
162
163 This doesn't need to be supplied, but will be used if it exists."""
164
166 """This constructor is used if we need to instantiate an object (not
167 the usual setup). This will mostly happen when the factory is asked for a
168 language for which we don't have a dedicated class."""
169 self.code = code or ""
170 while code:
171 langdata = data.languages.get(code, None)
172 if langdata:
173 self.fullname, self.nplurals, self.pluralequation = langdata
174 break
175 code = data.simplercode(code)
176 if not code:
177
178 pass
179
181 """Give a simple string representation without address information to
182 be able to store it in text for comparison later."""
183 detail = ""
184 if self.code:
185 detail = "(%s)" % self.code
186 return "<class 'translate.lang.common.Common%s'>" % detail
187
199 punctranslate = classmethod(punctranslate)
200
def character_iter(cls, text):
    """Returns an iterator over the characters in text.

    Characters found in ``cls.punctuation`` are not yielded, and of several
    adjacent whitespace characters only the first one is considered.
    """
    # NOTE(review): the ``def`` header is absent from the extracted source;
    # it was reconstructed from the names used in the body and from the
    # classmethod() rebinding below - confirm against the original file.

    # Seed with a non-space character so leading whitespace is not dropped.
    prev = 'A'
    for c in text:
        # Collapse runs of whitespace: skip a space that follows a space.
        if c.isspace() and prev.isspace():
            continue
        prev = c
        if c not in cls.punctuation:
            yield c
character_iter = classmethod(character_iter)
212
216 characters = classmethod(characters)
217
def word_iter(cls, text):
    """Returns an iterator over the words in text.

    The text is split on whitespace; each piece has leading and trailing
    characters from ``cls.punctuation`` removed, and pieces that end up
    empty are not yielded.
    """
    # NOTE(review): the ``def`` header is absent from the extracted source;
    # it was reconstructed from the names used in the body and from the
    # classmethod() rebinding below - confirm against the original file.
    for w in text.split():
        word = w.strip(cls.punctuation)
        if word:
            yield word
word_iter = classmethod(word_iter)
226
def words(cls, text):
    """Returns a list of words in text.

    This is the result of ``cls.word_iter(text)`` collected into a list.
    """
    # NOTE(review): the ``def`` header is absent from the extracted source;
    # it was reconstructed from the names used in the body and from the
    # classmethod() rebinding below - confirm against the original file.
    return list(cls.word_iter(text))
words = classmethod(words)
231
233 """Returns an iterator over the sentences in text."""
234 lastmatch = 0
235 iter = cls.sentencere.finditer(text)
236 for item in iter:
237 lastmatch = item.end()
238 sentence = item.group()
239 if strip: sentence = sentence.strip()
240 if sentence: yield sentence
241 remainder = text[lastmatch:]
242 if strip: remainder = remainder.strip()
243 if remainder: yield remainder
244 sentence_iter = classmethod(sentence_iter)
245
247 """Returns a list of sentences in text."""
248 return [s for s in cls.sentence_iter(text, strip=strip)]
249 sentences = classmethod(sentences)
250
def capsstart(cls, text):
    """Determines whether the text starts with a capital letter.

    Leading whitespace and then leading characters from ``cls.punctuation``
    are ignored before the first remaining character is tested.  Returns
    False when nothing remains after stripping.
    """
    # NOTE(review): the ``def`` header is absent from the extracted source;
    # it was reconstructed from the names used in the body and from the
    # classmethod() rebinding below - confirm against the original file.
    stripped = text.lstrip().lstrip(cls.punctuation)
    # bool() so that an empty remainder gives False rather than the empty
    # string itself.
    return bool(stripped) and stripped[0].isupper()
capsstart = classmethod(capsstart)
256