Context Navigation

Back to Ticket #1170

Ticket #1170: convert.py

File convert.py, 11.2 KB (added by trev, on 08/01/2014 at 10:59:53 AM)
Conversion script

Line
1	#!/usr/bin/env python
2	# coding: utf-8
3
4	import sys, os, re, json, errno, codecs, itertools
5	from collections import OrderedDict
6	from xml.dom import minidom, Node
7
# Base directory for everything this script writes; the includes/, pages/,
# locales/ and static/ output paths below are all joined onto it.
output_dir = "../wwwnew"
# Locale codes that are probed when looking for page, menu and footer files.
locales = ("ar", "bg", "de", "en", "es", "fr", "he", "hu", "ko", "lt", "nl", "pt_BR", "ru", "sk", "zh_CN", "zh_TW")
10
def ensure_dir(path):
    """Create the parent directory of the file *path* if it is missing.

    An already existing directory is not an error. A *path* without a
    directory component needs no directory and is left alone (previously
    this called ``os.makedirs("")``, which raises).

    Raises:
        OSError: if the directory cannot be created for any reason other
            than it already existing.
    """
    directory = os.path.dirname(path)
    if not directory:
        return
    try:
        os.makedirs(directory)
    except OSError as e:
        # Only "already exists" is expected here; everything else is real.
        if e.errno != errno.EEXIST:
            raise
17
def read_xml(path):
    """Parse the XML fragment stored at *path* and return a minidom Document.

    The fragment is wrapped in a ``<root>`` element, so the returned
    document's ``documentElement`` is always ``root``. Before parsing:

    * stray ``&`` characters (not part of an entity reference and not part
      of ``&&``) are escaped as ``&amp;``;
    * the ``en/`` prefix is dropped from ``href``/``src`` attributes and a
      bare ``href="en"`` becomes ``href="index"``;
    * the ``mdash`` and ``nbsp`` entities are declared so they resolve.

    The file is decoded as UTF-8.
    """
    with open(path, "rb") as handle:
        xml = handle.read().decode("utf-8")

    # Escape bare ampersands only; existing entity references stay intact.
    xml = re.sub(r"(?<!&)&(?!#?\w+;|&)", "&amp;", xml)
    xml = xml.replace(' href="en/', ' href="')
    xml = xml.replace(' href="en"', ' href="index"')
    xml = xml.replace(' src="en/', ' src="')

    # Character references keep this literal pure ASCII, independent of the
    # source file encoding: &#8212; is an em dash, &#160; a no-break space.
    document = "<!DOCTYPE root [<!ENTITY mdash \"&#8212;\"><!ENTITY nbsp \"&#160;\">]><root>%s</root>" % xml
    # Hand the parser bytes so Python 2 and 3 treat the text identically.
    return minidom.parseString(document.encode("utf-8"))
26
def save_locale(path, data):
    """Write *data* as indented UTF-8 JSON to *path*, creating its directory."""
    ensure_dir(path)
    json_options = {"ensure_ascii": False, "indent": 2, "separators": (',', ': ')}
    with codecs.open(path, "wb", encoding="utf-8") as output:
        json.dump(data, output, **json_options)
31
def get_text(node):
    """Return the concatenated text of *node*, whose children must all be text.

    Raises:
        Exception: if any child is not a text node. The offending node's
            name is printed to stdout first to help locate it.
    """
    result = []
    for child in node.childNodes:
        if child.nodeType != Node.TEXT_NODE:
            # nodeName exists on every node type; tagName only on elements,
            # so using it here would hide the real error for comments etc.
            sys.stdout.write("%s\n" % child.nodeName)
            raise Exception("Unexpected node type %i" % child.nodeType)
        result.append(child.nodeValue)
    return "".join(result)
40
def get_element(node, tagName, *args):
    """Follow a path of tag names down from *node* and return the element.

    At each level the first child element with the requested tag name is
    taken; there is no backtracking into later siblings. Returns None as
    soon as a level has no matching child.
    """
    current = node
    for wanted in (tagName,) + args:
        for candidate in current.childNodes:
            if candidate.nodeType == Node.ELEMENT_NODE and candidate.tagName == wanted:
                current = candidate
                break
        else:
            # No child with this tag name at the current level.
            return None
    return current
49
def merge_children(nodes):
    """Collapse runs of inline content into single text nodes, in place.

    *nodes* maps locale codes to parallel DOM nodes and must contain "en".
    The English node decides where the runs are: a run is a maximal
    sequence of children that are text nodes, attribute-less <em>/<strong>
    elements with a single text child, or <a> elements whose only
    attribute is href and which have a single text child. Whitespace-only
    text is trimmed from both ends of a run, and runs of at least two
    nodes are merged.

    For every locale the children at the same indexes are replaced by one
    text node holding their XML serialization. The href values of merged
    links are removed from the markup and stored, in order, on the new
    node's ``links`` attribute. A locale with too few children to cover
    the run loses all of its children from the start of the run onwards.

    The scan index is moved back after each merge so that it keeps
    pointing at the same node in the shortened child list; without that,
    every run after the first one in a parent was skipped.
    """
    def is_text(node):
        if node.nodeType == Node.TEXT_NODE:
            return True
        if node.nodeType != Node.ELEMENT_NODE or len(node.childNodes) != 1:
            return False
        if node.firstChild.nodeType != Node.TEXT_NODE:
            return False
        if node.tagName in ("em", "strong"):
            return len(node.attributes) == 0
        if node.tagName == "a":
            return len(node.attributes) == 1 and node.hasAttribute("href")
        return False

    def is_empty(node):
        return node.nodeType == Node.TEXT_NODE and not node.nodeValue.strip()

    def merge_run(parent, start, end):
        text = []
        links = []
        for child in parent.childNodes[start:end+1]:
            if child.nodeType == Node.ELEMENT_NODE and child.tagName == "a":
                links.append(child.getAttribute("href"))
                child.removeAttribute("href")
            text.append(child.toxml())
        node = parent.ownerDocument.createTextNode("".join(text))
        node.links = links
        parent.replaceChild(node, parent.childNodes[start])
        # The slice is a copy, so removing while iterating is safe.
        for child in parent.childNodes[start+1:end+1]:
            parent.removeChild(child)

    en = nodes["en"]
    start = None
    i = 0
    # One extra iteration past the last child closes a run that reaches
    # the end of the child list.
    while i <= len(en.childNodes):
        at_end = i >= len(en.childNodes)
        if start is None:
            if not at_end and is_text(en.childNodes[i]):
                start = i
        elif at_end or not is_text(en.childNodes[i]):
            end = i - 1
            while start < end and is_empty(en.childNodes[start]):
                start += 1
            while start < end and is_empty(en.childNodes[end]):
                end -= 1
            if start < end:
                for locale, parent in nodes.items():
                    if end < len(parent.childNodes):
                        merge_run(parent, start, end)
                    else:
                        while start < len(parent.childNodes):
                            parent.removeChild(parent.childNodes[start])
                # The run shrank to one node; keep i on the same node.
                i -= end - start
            start = None
        i += 1
105
def process_body(nodes, strings, counter=1):
    """Replace translatable text with ``$sN$`` placeholders, recursively.

    *nodes* maps locale codes to parallel DOM nodes (must contain "en");
    *strings* maps locale codes to the dicts receiving extracted strings.
    The English tree drives the traversal:

    * Elements other than <style>, <script> and <fix> get their inline
      runs merged (merge_children) and their children processed in turn.
    * Non-blank text nodes are assigned the next string number. In every
      locale the text, stripped of surrounding whitespace, is stored as
      ``strings[locale]["sN"]`` unless it is empty or marked ``[untr]``,
      and the node's value becomes ``$sN$`` (or ``$sN(link, ...)$`` when
      the English node carries ``links``) with the original surrounding
      whitespace preserved.
    * Any other node type is reported on stderr and left alone.

    Returns the next unused string number.
    """
    en = nodes["en"]
    if en.nodeType == Node.ELEMENT_NODE:
        if en.tagName not in ("style", "script", "fix"):
            merge_children(nodes)
            for i in range(len(en.childNodes)):
                # Pair up the i-th child of every locale that has one.
                new_nodes = {}
                for locale, value in nodes.items():
                    if len(value.childNodes) > i:
                        new_nodes[locale] = value.childNodes[i]
                counter = process_body(new_nodes, strings, counter)
    elif en.nodeType == Node.TEXT_NODE:
        if en.nodeValue.strip():
            link_targets = getattr(en, "links", None)
            if link_targets:
                links = "(%s)" % ", ".join(link_targets)
            else:
                links = ""
            for locale, value in nodes.items():
                text = value.nodeValue or ""
                # Split into leading whitespace, content, trailing
                # whitespace; this pattern matches any string.
                pre, text, post = re.search(r"^(\s*)(.*?)(\s*)$", text, re.S).groups()
                if text and "[untr]" not in text:
                    strings[locale]["s%i" % counter] = {"message": text}
                value.nodeValue = "%s$s%i%s$%s" % (pre, counter, links, post)
            counter += 1
    else:
        sys.stderr.write("Unexpected node type %i\n" % en.nodeType)

    return counter
133
def _xml_to_html(pagedata):
    """Rewrite the XML serialization of a page as HTML-style markup."""
    pagedata = pagedata.replace("/_override-static/global/global", "")
    pagedata = re.sub(r"</?fix/?>", "", pagedata, flags=re.S)

    # <foo><attr name="bar">test</attr> => <foo bar="test">
    pagedata = re.sub(r'>\s*<attr\s+name="(\w+)">([^"<>]*)</attr\b', r' \1="\2"', pagedata, flags=re.S)

    # <script src=""/> => <script src=""></script>
    pagedata = re.sub(r'<((?!link\b|meta\b|br\b|col\b|base\b|img\b|param\b|area\b|hr\b|input\b)([\w:]+)\b[^<>]*)/>', r'<\1></\2>', pagedata, flags=re.S)

    # <img src="foo"/> => <img src="foo">
    pagedata = re.sub(r'\/>', r'>', pagedata)

    # <img src="foo">dummy</img> => <img src="foo">
    pagedata = re.sub(r'<((link|meta|br|col|base|img|param|area|hr|input)\b[^<>]*)>([^<>]*)</\2>', r'<\1>', pagedata, flags=re.S)
    return pagedata


def process_page(path, menu):
    """Convert the page at *path* in all locales and write the results.

    *path* is the English page file, either ``page!<locale>`` itself (the
    site index) or ``<locale>/.../page!<name>``. The same file is looked
    up for every entry of ``locales``. Output, below ``output_dir``:

    * ``includes/index.tmpl`` for the index page (with ``$sN$``
      placeholders turned into ``translate`` filter expressions), or
      ``pages/<name>.raw`` for every other page, built from the English
      head and body;
    * ``locales/<locale>/<name>.json`` with the strings of each locale
      that has any.

    *menu* is the mapping built by process_menu(). If the page has an
    English menu entry, that entry is removed from *menu* for every
    locale. When the English menu text differs from the page title, the
    menu text becomes the page's ``title`` string and the real title is
    kept as ``title-full``.
    """
    pagename = os.path.join(os.path.dirname(path), os.path.basename(path).replace("page!", ""))
    if "/" not in pagename:
        pagename = os.path.join(pagename, "index")
        path_format = "page!%s"
    else:
        path_format = "%s/" + path.split("/", 1)[1]
    # Drop the leading locale directory: "en/foo" => "foo".
    pagename = pagename.split("/", 1)[1]

    data = {}
    strings = {}
    for locale in locales:
        if not os.path.exists(path_format % locale):
            continue
        data[locale] = read_xml(path_format % locale)
        strings[locale] = OrderedDict()

    for locale in data:
        title = get_text(get_element(data[locale].documentElement, "title", "anwv")).strip()
        if title and "[untr]" not in title:
            strings[locale]["title"] = {"message": title}

    titlestring = "title"
    if pagename in menu["en"]:
        if menu["en"][pagename]["message"] != strings["en"]["title"]["message"]:
            titlestring = "title-full"
            for locale in locales:
                if locale in strings and "title" in strings[locale]:
                    title = strings[locale]["title"]
                    if locale in menu and pagename in menu[locale]:
                        strings[locale]["title"] = menu[locale][pagename]
                    else:
                        del strings[locale]["title"]
                    strings[locale]["title-full"] = title
        # The menu text now lives in the page's own strings. Look the page
        # up in the locale's menu dict, not in the locale code itself.
        for locale in locales:
            if locale in menu and pagename in menu[locale]:
                del menu[locale][pagename]

    bodies = {}
    for locale, value in data.items():
        bodies[locale] = get_element(value.documentElement, "body", "anwv")
        if pagename == "index":
            container = get_element(bodies[locale], "div")
            container.setAttribute("id", "install-container")
            container.setAttribute("class", "{{page}}")
    process_body(bodies, strings)

    # Only the English tree is written out; strip the <anwv> wrappers.
    body = re.sub(r"</?anwv/?>", "", bodies["en"].toxml())
    head = re.sub(r"</?anwv/?>", "", get_element(data["en"].documentElement, "head", "anwv").toxml())
    if head:
        pagedata = "<head>%s</head>%s" % (head, body)
    else:
        pagedata = body

    pagedata = _xml_to_html(pagedata)

    if pagename == "index":
        def translate_tag(match):
            return '{{"%s"|translate(links=[%s])}}' % (match.group(1), '"%s"' % '", "'.join(match.group(2).split(", ")))

        # $s1$ => {{"s1"|translate}}
        pagedata = re.sub(r"\$([\w\-]+)\$", r'{{"\1"|translate}}', pagedata)
        # $s1(foo, bar)$ => {{"s1"|translate(links=["foo", "bar"])}}
        pagedata = re.sub(r"\$([\w\-]+)\((.*?)\)\$", translate_tag, pagedata)
        pagedata = "noheading=True\n\n%s" % pagedata
    elif titlestring != "title":
        pagedata = "title=%s\n\n%s" % (titlestring, pagedata)

    if pagename == "index":
        target = os.path.join(output_dir, "includes", pagename + ".tmpl")
    else:
        target = os.path.join(output_dir, "pages", pagename + ".raw")
    ensure_dir(target)
    with codecs.open(target, "wb", encoding="utf-8") as handle:
        handle.write(pagedata)

    for locale, value in strings.items():
        if value:
            localefile = os.path.join(output_dir, "locales", locale, pagename + ".json")
            save_locale(localefile, value)
225
def process_image(path):
    """Copy the image at *path* into the output tree, dropping "image!".

    Images below ``en/`` go to ``locales/<dir>/``, all others to
    ``static/<dir>/``, both below ``output_dir``.
    """
    category = "locales" if path.startswith("en/") else "static"
    filename = os.path.basename(path).replace("image!", "")
    target = os.path.join(output_dir, category, os.path.dirname(path), filename)

    with open(path, "rb") as source:
        contents = source.read()

    ensure_dir(target)
    with open(target, "wb") as destination:
        destination.write(contents)
237
def process_file(path, menu):
    """Dispatch one file to the page or image converter by its name prefix.

    Three known pages are skipped silently; any file that is neither a
    page nor an image is reported on stderr.
    """
    name = os.path.basename(path)
    if name in ("page!footer", "page!internet-explorer", "page!contribute-old"):
        return

    if name.startswith("page!"):
        process_page(path, menu)
    elif name.startswith("image!"):
        process_image(path)
    else:
        sys.stderr.write("Ignoring file %s\n" % path)
248
def process(path, menu):
    """Convert *path*: a single file, or every entry of a directory tree.

    Paths that are neither a file nor a directory are reported on stderr.
    """
    if os.path.isfile(path):
        process_file(path, menu)
        return

    if not os.path.isdir(path):
        sys.stderr.write("Ignoring file %s\n" % path)
        return

    for entry in os.listdir(path):
        process(os.path.join(path, entry), menu)
257
def process_menu():
    """Collect the menu and footer strings of every locale.

    Reads ``<locale>/_include/menu!menu`` and
    ``<locale>/_include/page!footer`` where they exist and returns a dict
    mapping each locale code to an OrderedDict of
    ``{string name: {"message": text}}``. Empty texts and texts marked
    ``[untr]`` are left out.

    Raises:
        Exception: if a menu item links to a URL this script does not
            know how to name.
    """
    menu = {}

    menu_format = "%s/_include/menu!menu"
    footer_format = "%s/_include/page!footer"
    footer_headings = ("resources", "community", "development", "follow-us")

    def add_string(locale, name, text):
        if text and "[untr]" not in text:
            menu[locale][name] = {"message": text}

    for locale in locales:
        menu[locale] = OrderedDict()
        if os.path.exists(menu_format % locale):
            data = read_xml(menu_format % locale)
            items = get_element(data.documentElement, "items")
            for node in items.childNodes:
                # Whitespace and comments between items are not menu entries.
                if node.nodeType != Node.ELEMENT_NODE:
                    continue
                text = get_text(get_element(node, "mainlink", "anwv", "title", "anwv")).strip()
                url = get_text(get_element(node, "mainlink", "anwv", "url", "anwv")).strip()
                if url == "en":
                    string = "installation"
                elif url.startswith("en/"):
                    # Strip the prefix only; a later "en/" inside the URL
                    # belongs to the page name.
                    string = url[len("en/"):]
                elif url == "/languages/":
                    continue # Unused string
                elif url == "/search/":
                    string = "search"
                else:
                    raise Exception("Unexpected URL in menu: %s" % url)
                add_string(locale, string, text)
        if os.path.exists(footer_format % locale):
            data = read_xml(footer_format % locale)
            # Footer headings are named by position; extra headings are ignored.
            for string, heading in zip(footer_headings, data.getElementsByTagName("h1")):
                add_string(locale, string, get_text(heading).strip())
            for link in data.getElementsByTagName("a"):
                url = link.getAttribute("href").replace("/de/", "")
                text = get_text(link).strip()
                if url == "/forum/viewforum.php?f=11":
                    string = "_bugs"
                elif url.startswith("/"):
                    # Name the string after the last path component.
                    string = url.strip("/").split("/")[-1]
                else:
                    string = url
                add_string(locale, string, text)
    return menu
301
# Gather the menu/footer strings first: converting pages removes the
# entries that end up in the pages' own string files.
menu = process_menu()
process("page!en", menu)
process("en", menu)
process("images", menu)

# Whatever is left in the menu goes to one menu.json per locale.
for locale, value in menu.items():
    if "_bugs" in value:
        # Publish the forum link string under its final name.
        value["bugs"] = value.pop("_bugs")
    localefile = os.path.join(output_dir, "locales", locale, "menu.json")
    save_locale(localefile, value)

Download in other formats:

Original Format