1 | #!/usr/bin/env python |
---|
2 | # coding: utf-8 |
---|
3 | |
---|
4 | import sys, os, re, json, errno, codecs, itertools |
---|
5 | from collections import OrderedDict |
---|
6 | from xml.dom import minidom, Node |
---|
7 | |
---|
# Root of the generated tree; pages, includes, static files and locale
# JSON files are all written below this directory.
output_dir = "../wwwnew"

# Locale codes that are looked up when converting pages and menus.
locales = (
    "ar", "bg", "de", "en", "es", "fr", "he", "hu",
    "ko", "lt", "nl", "pt_BR", "ru", "sk", "zh_CN", "zh_TW",
)
---|
10 | |
---|
def ensure_dir(path):
    """Create the directory that will contain *path* if it is missing.

    Only the parent directory of ``path`` is created; ``path`` itself is
    treated as a file name. An already existing directory is not an error,
    and a bare file name (no directory component) is a no-op.

    Raises:
        OSError: if the directory cannot be created for any reason other
            than it already existing.
    """
    directory = os.path.dirname(path)
    if not directory:
        # Nothing to create; os.makedirs("") would fail with ENOENT.
        return
    try:
        os.makedirs(directory)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise
---|
17 | |
---|
def read_xml(path):
    """Read a markup fragment from *path* and parse it with minidom.

    The raw bytes are cleaned up before parsing:

    * bare ``&`` characters that do not start an entity reference are
      escaped as ``&amp;``;
    * ``href``/``src`` values starting with ``en/`` lose that prefix and
      ``href="en"`` becomes ``href="index"``.

    The result is wrapped in a synthetic ``<root>`` element, preceded by a
    DOCTYPE that declares the ``mdash`` and ``nbsp`` entities.

    Returns:
        The parsed ``xml.dom.minidom`` document; its ``documentElement`` is
        the synthetic ``<root>``.

    Raises:
        IOError/OSError: if the file cannot be read.
        xml.parsers.expat.ExpatError: if the fragment is still not
            well-formed after the clean-up.
    """
    with open(path, "rb") as handle:
        xml = handle.read()

    # The data is handled as bytes throughout, hence the bytes literals.
    # The replacement has to be the escaped form: replacing "&" with "&"
    # would leave the bare ampersands in place and make the parser fail.
    xml = re.sub(br"(?<!&)&(?!#?\w+;|&)", b"&amp;", xml)
    xml = xml.replace(b' href="en/', b' href="')
    xml = xml.replace(b' href="en"', b' href="index"')
    xml = xml.replace(b' src="en/', b' src="')

    # Entities are declared through numeric character references so that
    # the declaration is plain ASCII regardless of the source encoding.
    document = (
        b'<!DOCTYPE root ['
        b'<!ENTITY mdash "&#8212;">'
        b'<!ENTITY nbsp "&#160;">'
        b']><root>' + xml + b'</root>'
    )
    return minidom.parseString(document)
---|
26 | |
---|
def save_locale(path, data):
    """Write *data* to *path* as indented, UTF-8 encoded JSON.

    The directory that holds ``path`` is created first when it is missing.
    Non-ASCII characters are written as-is rather than as ``\\uXXXX``
    escapes.
    """
    ensure_dir(path)
    json_options = dict(ensure_ascii=False, indent=2, separators=(',', ': '))
    with codecs.open(path, "wb", encoding="utf-8") as out:
        json.dump(data, out, **json_options)
---|
31 | |
---|
def get_text(node):
    """Return the concatenated text of *node*'s direct children.

    Every child of ``node`` must be a text node.

    Raises:
        Exception: if a child is anything other than a text node. The
            message contains the node type and the node name of the
            offending child.
    """
    result = []
    for child in node.childNodes:
        if child.nodeType != Node.TEXT_NODE:
            # nodeName is available on every DOM node, whereas tagName only
            # exists on elements; a comment child would otherwise fail with
            # AttributeError before the intended error could be raised.
            raise Exception("Unexpected node type %i (%s)" % (child.nodeType, child.nodeName))
        result.append(child.nodeValue)
    return "".join(result)
---|
40 | |
---|
def get_element(node, tagName, *args):
    """Follow a path of tag names downwards from *node*.

    ``tagName`` and each further name in ``args`` select the first direct
    child element with that tag name, starting at ``node`` and descending
    one level per name.

    Returns:
        The element reached by the last name, or ``None`` as soon as one
        level has no matching child. Only the first match on each level is
        followed; later siblings with the same tag are never tried.
    """
    current = node
    for wanted in (tagName,) + args:
        match = None
        for candidate in current.childNodes:
            if candidate.nodeType == Node.ELEMENT_NODE and candidate.tagName == wanted:
                match = candidate
                break
        if match is None:
            return None
        current = match
    return current
---|
49 | |
---|
def merge_children(nodes):
    """Collapse runs of inline content into single text nodes, in place.

    ``nodes`` maps locale codes to DOM nodes that correspond to each other;
    the ``"en"`` entry is required and determines where the runs are.

    A run is a sequence of consecutive children of the English node that
    are text nodes, attribute-less ``<em>``/``<strong>`` elements holding a
    single text node, or ``<a>`` elements whose only attribute is ``href``
    and which hold a single text node. Whitespace-only text nodes are
    trimmed from both ends of a run, and runs of fewer than two nodes are
    left alone.

    For every locale the children at the positions of the run are replaced
    by one text node whose value is the serialized markup of the run, with
    the ``href`` attributes removed from the links. The removed ``href``
    values are stored, in order, in a ``links`` attribute on the new text
    node. A locale that has too few children to cover the run loses all
    children from the start of the run onwards.
    """
    def is_text(node):
        # Text itself, or a simple inline element wrapping exactly one
        # text node.
        if node.nodeType == Node.TEXT_NODE:
            return True
        if node.nodeType != Node.ELEMENT_NODE:
            return False
        if len(node.childNodes) != 1 or node.firstChild.nodeType != Node.TEXT_NODE:
            return False
        if node.tagName in ("em", "strong"):
            return len(node.attributes) == 0
        if node.tagName == "a":
            return len(node.attributes) == 1 and node.hasAttribute("href")
        return False

    def is_empty(node):
        return node.nodeType == Node.TEXT_NODE and not node.nodeValue.strip()

    en = nodes["en"]
    start = None
    i = 0
    # This has to be a while loop: merging shrinks en.childNodes, so the
    # index must be moved back afterwards. Inside a for loop over range()
    # that adjustment is discarded and the nodes following a merged run
    # are skipped. The extra iteration at i == len(...) closes a run that
    # extends to the last child.
    while i <= len(en.childNodes):
        at_end = i >= len(en.childNodes)
        if start is None:
            if not at_end and is_text(en.childNodes[i]):
                start = i
        elif at_end or not is_text(en.childNodes[i]):
            end = i - 1
            while start < end and is_empty(en.childNodes[start]):
                start += 1
            while start < end and is_empty(en.childNodes[end]):
                end -= 1
            if start < end:
                for parent in nodes.values():
                    if end < len(parent.childNodes):
                        text = []
                        links = []
                        for child in parent.childNodes[start:end + 1]:
                            if child.nodeType == Node.ELEMENT_NODE and child.tagName == "a":
                                links.append(child.getAttribute("href"))
                                child.removeAttribute("href")
                            text.append(child.toxml())
                        node = parent.ownerDocument.createTextNode("".join(text))
                        node.links = links
                        parent.replaceChild(node, parent.childNodes[start])
                        # The slice is a copy, so removing while iterating
                        # is safe.
                        for child in parent.childNodes[start + 1:end + 1]:
                            parent.removeChild(child)
                    else:
                        while start < len(parent.childNodes):
                            parent.removeChild(parent.childNodes[start])
            # end - start nodes were removed in front of position i (zero
            # if nothing was merged); keep i on the node that ended the run.
            i -= end - start
            start = None
        i += 1
---|
105 | |
---|
def process_body(nodes, strings, counter=1):
    """Replace translatable text by placeholders and collect the strings.

    ``nodes`` maps locale codes to corresponding DOM nodes (``"en"`` is
    required) and ``strings`` maps locale codes to the dictionaries that
    receive the extracted messages.

    * Element nodes other than ``style``, ``script`` and ``fix`` get their
      inline runs merged and are then processed child by child, pairing
      each English child with the child at the same index in the other
      locales.
    * A non-blank English text node is assigned the string ID
      ``s<counter>``. For every locale the stripped text is stored under
      that ID unless it is empty or contains ``[untr]``, and the node value
      becomes ``$s<counter>$`` (or ``$s<counter>(link, ...)$`` when the
      English node carries ``links``), keeping the surrounding whitespace.
    * Any other node type is reported on stderr.

    Returns:
        The next unused counter value.
    """
    en_node = nodes["en"]
    kind = en_node.nodeType

    if kind == Node.ELEMENT_NODE:
        if en_node.tagName in ("style", "script", "fix"):
            return counter
        merge_children(nodes)
        for index in range(len(en_node.childNodes)):
            # Locales with fewer children simply drop out at this level.
            per_locale = {}
            for locale, parent in nodes.items():
                if index < len(parent.childNodes):
                    per_locale[locale] = parent.childNodes[index]
            counter = process_body(per_locale, strings, counter)
        return counter

    if kind != Node.TEXT_NODE:
        sys.stderr.write("Unexpected node type %i\n" % kind)
        return counter

    if not en_node.nodeValue.strip():
        return counter

    link_list = getattr(en_node, "links", None)
    suffix = ("(%s)" % ", ".join(link_list)) if link_list else ""
    for locale, value in nodes.items():
        raw = value.nodeValue or ""
        lead, message, trail = re.search(r"^(\s*)(.*?)(\s*)$", raw, re.S).groups()
        if message and "[untr]" not in message:
            strings[locale]["s%i" % counter] = {"message": message}
        value.nodeValue = "%s$s%i%s$%s" % (lead, counter, suffix, trail)
    return counter + 1
---|
133 | |
---|
def process_page(path, menu):
    """Convert one page, in all available locales, into the output tree.

    ``path`` is the path of the English source page; the file name carries
    a ``page!`` prefix. ``menu`` is the per-locale menu string mapping as
    returned by ``process_menu``; entries for this page are removed from
    it once they have been taken over into the page's own strings.

    The function writes

    * the English page structure with ``$id$`` placeholders to
      ``pages/<name>.raw`` (``includes/index.tmpl`` for the index page,
      where the placeholders are turned into ``translate`` filter calls);
    * one ``locales/<locale>/<name>.json`` per locale that has strings.

    Raises:
        KeyError: if the English version of the page is missing, or if the
            page is listed in the English menu but has no usable English
            title.
    """
    pagename = os.path.join(os.path.dirname(path), os.path.basename(path).replace("page!", ""))
    if "/" not in pagename:
        # Top-level page: its locale variants are siblings named
        # "page!<locale>".
        pagename = os.path.join(pagename, "index")
        path_format = "page!%s"
    else:
        # Page inside a locale directory: swap the first path component.
        path_format = "%s/" + path.split("/", 1)[1]
    # The page name is what follows the first path component.
    pagename = pagename.split("/", 1)[1]

    data = {}
    strings = {}
    for locale in locales:
        source = path_format % locale
        if not os.path.exists(source):
            continue
        data[locale] = read_xml(source)
        strings[locale] = OrderedDict()

    for locale in data:
        title = get_text(get_element(data[locale].documentElement, "title", "anwv")).strip()
        if title and title.find("[untr]") < 0:
            strings[locale]["title"] = {"message": title}

    titlestring = "title"
    if pagename in menu["en"]:
        if menu["en"][pagename]["message"] != strings["en"]["title"]["message"]:
            # The menu label differs from the page title: the label
            # becomes "title" and the real title moves to "title-full".
            titlestring = "title-full"
            for locale in locales:
                if locale in strings and "title" in strings[locale]:
                    title = strings[locale]["title"]
                    if locale in menu and pagename in menu[locale]:
                        strings[locale]["title"] = menu[locale][pagename]
                    else:
                        del strings[locale]["title"]
                    strings[locale]["title-full"] = title
        # Remove this page's entry from the menu strings. The membership
        # test must look into menu[locale]; testing the locale code itself
        # is a substring check that practically never matches.
        # NOTE(review): the nesting of this loop was reconstructed from a
        # dump without indentation -- confirm it belongs at this level.
        for locale in locales:
            if locale in menu and pagename in menu[locale]:
                del menu[locale][pagename]

    bodies = {}
    for locale, value in data.items():
        bodies[locale] = get_element(value.documentElement, "body", "anwv")
        if pagename == "index":
            container = get_element(bodies[locale], "div")
            container.setAttribute("id", "install-container")
            container.setAttribute("class", "{{page}}")
    process_body(bodies, strings)

    # Only the English structure is written out; translations live in the
    # per-locale string files.
    body = re.sub(r"</?anwv/?>", "", bodies["en"].toxml())
    head = re.sub(r"</?anwv/?>", "", get_element(data["en"].documentElement, "head", "anwv").toxml())
    if head:
        pagedata = "<head>%s</head>%s" % (head, body)
    else:
        pagedata = body

    pagedata = pagedata.replace("/_override-static/global/global", "")
    pagedata = re.sub(r"</?fix/?>", "", pagedata, flags=re.S)

    # <foo><attr name="bar">test</attr> => <foo bar="test">
    pagedata = re.sub(r'>\s*<attr\s+name="(\w+)">([^"<>]*)</attr\b', r' \1="\2"', pagedata, flags=re.S)

    # <script src=""/> => <script src=""></script>
    pagedata = re.sub(r'<((?!link\b|meta\b|br\b|col\b|base\b|img\b|param\b|area\b|hr\b|input\b)([\w:]+)\b[^<>]*)/>', r'<\1></\2>', pagedata, flags=re.S)

    # <img src="foo"/> => <img src="foo">
    pagedata = re.sub(r'\/>', r'>', pagedata)

    # <img src="foo">dummy</img> => <img src="foo">
    pagedata = re.sub(r'<((link|meta|br|col|base|img|param|area|hr|input)\b[^<>]*)>([^<>]*)</\2>', r'<\1>', pagedata, flags=re.S)

    if pagename == "index":
        def translate_tag(match):
            return r'{{"%s"|translate(links=[%s])}}' % (match.group(1), '"%s"' % '", "'.join(match.group(2).split(", ")))

        # Plain placeholders first, then the ones that carry a link list.
        pagedata = re.sub(r"\$([\w\-]+)\$", r'{{"\1"|translate}}', pagedata)
        pagedata = re.sub(r"\$([\w\-]+)\((.*?)\)\$", translate_tag, pagedata)
        pagedata = "noheading=True\n\n%s" % pagedata
    elif titlestring != "title":
        pagedata = "title=%s\n\n%s" % (titlestring, pagedata)

    if pagename == "index":
        target = os.path.join(output_dir, "includes", pagename + ".tmpl")
    else:
        target = os.path.join(output_dir, "pages", pagename + ".raw")
    ensure_dir(target)
    with codecs.open(target, "wb", encoding="utf-8") as handle:
        handle.write(pagedata)

    for locale, value in strings.items():
        if value:
            localefile = os.path.join(output_dir, "locales", locale, pagename + ".json")
            save_locale(localefile, value)
---|
225 | |
---|
def process_image(path):
    """Copy the image at *path* into the output tree.

    Images whose path starts with ``en/`` go below ``locales``, all others
    below ``static``. The source directory structure is kept and the
    ``image!`` marker is removed from the file name.
    """
    section = "locales" if path.startswith("en/") else "static"
    filename = os.path.basename(path).replace("image!", "")
    target = os.path.join(output_dir, section, os.path.dirname(path), filename)

    # Read first so that a missing source does not leave an empty
    # directory behind.
    with open(path, "rb") as source:
        content = source.read()

    ensure_dir(target)
    with open(target, "wb") as destination:
        destination.write(content)
---|
237 | |
---|
def process_file(path, menu):
    """Dispatch a single file to the page or image converter.

    Files named ``page!footer``, ``page!internet-explorer`` and
    ``page!contribute-old`` are skipped silently. Other ``page!`` files go
    to ``process_page`` and ``image!`` files to ``process_image``; anything
    else is reported on stderr and ignored.
    """
    filename = os.path.basename(path)
    skipped = ("page!footer", "page!internet-explorer", "page!contribute-old")
    if filename in skipped:
        return

    if filename.startswith("page!"):
        process_page(path, menu)
        return
    if filename.startswith("image!"):
        process_image(path)
        return
    sys.stderr.write("Ignoring file %s\n" % path)
---|
248 | |
---|
def process(path, menu):
    """Convert *path*, descending into it when it is a directory.

    Regular files are handed to ``process_file``; directory entries are
    processed recursively in ``os.listdir`` order. A path that is neither
    a file nor a directory is reported on stderr and ignored.
    """
    if os.path.isfile(path):
        process_file(path, menu)
        return
    if not os.path.isdir(path):
        sys.stderr.write("Ignoring file %s\n" % path)
        return
    for entry in os.listdir(path):
        process(os.path.join(path, entry), menu)
---|
257 | |
---|
def process_menu():
    """Collect the menu and footer strings of every locale.

    For each locale the files ``<locale>/_include/menu!menu`` and
    ``<locale>/_include/page!footer`` are read when they exist. Menu items
    are keyed by a name derived from their URL, the footer headings by the
    fixed names ``resources``, ``community``, ``development`` and
    ``follow-us`` (in document order), and footer links by the last path
    component of their target. Empty texts and texts containing ``[untr]``
    are left out.

    Returns:
        A dict mapping every locale code to an ``OrderedDict`` of
        ``{string name: {"message": text}}``.

    Raises:
        Exception: if a menu item has a URL that is not recognized.
    """
    def is_translated(text):
        return text and text.find("[untr]") < 0

    footer_headings = ("resources", "community", "development", "follow-us")
    menu = {}

    for locale in locales:
        entries = menu[locale] = OrderedDict()

        menu_path = "%s/_include/menu!menu" % locale
        if os.path.exists(menu_path):
            document = read_xml(menu_path)
            items = get_element(document.documentElement, "items")
            for item in items.childNodes:
                text = get_text(get_element(item, "mainlink", "anwv", "title", "anwv")).strip()
                url = get_text(get_element(item, "mainlink", "anwv", "url", "anwv")).strip()
                if url == "/languages/":
                    continue  # Unused string
                if url == "en":
                    key = "installation"
                elif url.startswith("en/"):
                    key = url.replace("en/", "")
                elif url == "/search/":
                    key = "search"
                else:
                    raise Exception("Unexpected URL in menu: %s" % url)
                if is_translated(text):
                    entries[key] = {"message": text}

        footer_path = "%s/_include/page!footer" % locale
        if os.path.exists(footer_path):
            document = read_xml(footer_path)

            # Headings are matched to their names purely by position.
            headings = document.getElementsByTagName("h1")
            for key, heading in zip(footer_headings, headings):
                text = get_text(heading).strip()
                if is_translated(text):
                    entries[key] = {"message": text}

            for link in document.getElementsByTagName("a"):
                url = link.getAttribute("href").replace("/de/", "")
                text = get_text(link).strip()
                if url == "/forum/viewforum.php?f=11":
                    key = "_bugs"
                elif url.startswith("/"):
                    key = url.strip("/").split("/")[-1]
                else:
                    key = url
                if is_translated(text):
                    entries[key] = {"message": text}

    return menu
---|
301 | |
---|
# Convert everything: the menu strings first, because page conversion
# reads (and may prune) them, then the pages and images.
menu = process_menu()
for source in ("page!en", "en", "images"):
    process(source, menu)

# Write what is left of the menu strings, one file per locale.
for locale, value in menu.items():
    if "_bugs" in value:
        # "_bugs" is only a temporary name used while collecting.
        value["bugs"] = value.pop("_bugs")
    localefile = os.path.join(output_dir, "locales", locale, "menu.json")
    save_locale(localefile, value)
---|