Ticket #1170: convert.py

File convert.py, 11.2 KB (added by trev, on 08/01/2014 at 10:59:53 AM)

Conversion script

Line 
1#!/usr/bin/env python
2# coding: utf-8
3
4import sys, os, re, json, errno, codecs, itertools
5from collections import OrderedDict
6from xml.dom import minidom, Node
7
# Root directory under which every generated file (pages, includes, locale
# JSON files and images) is written.
output_dir = "../wwwnew"

# Locale codes whose source files are looked up during the conversion.
locales = (
  "ar", "bg", "de", "en", "es", "fr", "he", "hu",
  "ko", "lt", "nl", "pt_BR", "ru", "sk", "zh_CN", "zh_TW",
)
10
def ensure_dir(path):
  """Make sure the directory that will contain *path* exists.

  Only the parent directory of *path* is created (with any missing
  ancestors); *path* itself is never touched. An already existing directory
  is not an error, any other OSError is propagated.
  """
  directory = os.path.dirname(path)
  if not directory:
    # A bare file name lives in the current directory, nothing to create
    # (os.makedirs("") would raise).
    return

  try:
    os.makedirs(directory)
  except OSError as e:
    # Only "already exists" is expected here
    if e.errno != errno.EEXIST:
      raise
17
def read_xml(path):
  """Read the markup fragment stored at *path* and parse it with minidom.

  The raw file contents are patched up before parsing:

  * ampersands that do not start an entity/character reference and are not
    adjacent to another ampersand are escaped as ``&amp;``
  * ``href``/``src`` attribute values starting with ``en/`` lose that prefix,
    ``href="en"`` becomes ``href="index"``

  The fragment is wrapped into a ``<root>`` element with a DOCTYPE declaring
  the ``mdash`` and ``nbsp`` entities. Returns the resulting Document; its
  documentElement is the ``<root>`` wrapper.
  """
  with open(path, "rb") as handle:
    xml = handle.read()

  # The file is read as bytes, so all patterns and replacements are bytes
  # literals as well (plain str on Python 2).
  xml = re.sub(br"(?<!&)&(?!#?\w+;|&)", b"&amp;", xml)
  xml = xml.replace(b' href="en/', b' href="')
  xml = xml.replace(b' href="en"', b' href="index"')
  xml = xml.replace(b' src="en/', b' src="')

  prologue = b"<!DOCTYPE root [<!ENTITY mdash \"&#8212;\"><!ENTITY nbsp \"&#xA0;\">]><root>"
  return minidom.parseString(prologue + xml + b"</root>")
26
def save_locale(path, data):
  """Write *data* to *path* as indented UTF-8 JSON.

  The parent directory of *path* is created first if necessary. Non-ASCII
  characters are written as-is rather than as escape sequences.
  """
  ensure_dir(path)

  dump_options = {
    "ensure_ascii": False,
    "indent": 2,
    "separators": (',', ': '),
  }
  with codecs.open(path, "wb", encoding="utf-8") as output:
    json.dump(data, output, **dump_options)
31
def get_text(node):
  """Return the concatenated values of all text children of *node*.

  Every direct child has to be a text node; an Exception naming the
  offending node type (and node name) is raised otherwise. A node without
  children yields an empty string.
  """
  parts = []
  for child in node.childNodes:
    if child.nodeType != Node.TEXT_NODE:
      # nodeName is available on every node kind, unlike tagName which only
      # exists on elements.
      raise Exception("Unexpected node type %i (%s)" % (child.nodeType, child.nodeName))
    parts.append(child.nodeValue)
  return "".join(parts)
40
def get_element(node, tagName, *args):
  """Follow a path of tag names downwards from *node*.

  At each level the first child element with the requested tag name is
  selected (no backtracking to later siblings). Returns the element reached
  after the last name, or None as soon as a level has no matching child.
  """
  current = node
  for name in (tagName,) + args:
    for child in current.childNodes:
      if child.nodeType == Node.ELEMENT_NODE and child.tagName == name:
        current = child
        break
    else:
      # No child with this tag name on the current level
      return None
  return current
49
def merge_children(nodes):
  """Merge runs of inline content into single text nodes, for all locales.

  *nodes* maps locale codes to corresponding parent nodes and must contain
  an "en" entry; the English children decide where runs start and end. A run
  is a sequence of consecutive children that are text nodes, attribute-less
  <em>/<strong> elements with a single text child, or <a> elements with only
  an href attribute and a single text child. Whitespace-only text nodes at
  either edge of a run are left out of it.

  A run of at least two nodes is replaced, in every locale, by one text node
  whose value is the serialized markup of the run (with href attributes
  removed from links). The removed href values are stored in order in the
  ``links`` attribute of the new text node. A locale with too few children
  to cover the run loses all its children from the run start onwards.
  """
  def is_text(node):
    """Tell whether *node* may be part of a merged run."""
    if node.nodeType == Node.TEXT_NODE:
      return True
    if node.nodeType != Node.ELEMENT_NODE:
      return False
    # Inline elements qualify only if they wrap exactly one text node
    if len(node.childNodes) != 1 or node.firstChild.nodeType != Node.TEXT_NODE:
      return False
    if node.tagName in ("em", "strong"):
      return len(node.attributes) == 0
    if node.tagName == "a":
      return len(node.attributes) == 1 and node.hasAttribute("href")
    return False

  def is_empty(node):
    """Tell whether *node* is a whitespace-only text node."""
    return node.nodeType == Node.TEXT_NODE and not node.nodeValue.strip()

  en = nodes["en"]
  start = None
  i = 0
  # A while loop is required: merging shrinks the child list, so the index
  # has to be moved back afterwards (reassigning the variable of a for loop
  # has no effect on the iteration). The extra pass with i == len(...)
  # flushes a run that extends to the last child.
  while i <= len(en.childNodes):
    in_run = i < len(en.childNodes) and is_text(en.childNodes[i])
    if start is None:
      if in_run:
        start = i
    elif not in_run:
      end = i - 1
      # Keep surrounding whitespace-only nodes out of the merged string
      while start < end and is_empty(en.childNodes[start]):
        start += 1
      while start < end and is_empty(en.childNodes[end]):
        end -= 1
      if start < end:
        for parent in nodes.values():
          if end < len(parent.childNodes):
            text = []
            links = []
            for child in parent.childNodes[start:end+1]:
              if child.nodeType == Node.ELEMENT_NODE and child.tagName == "a":
                # Link targets are kept separately from the text
                links.append(child.getAttribute("href"))
                child.removeAttribute("href")
              text.append(child.toxml())
            node = parent.ownerDocument.createTextNode("".join(text))
            node.links = links
            parent.replaceChild(node, parent.childNodes[start])
            for child in parent.childNodes[start+1:end+1]:
              parent.removeChild(child)
          else:
            # This locale does not have enough children to cover the run,
            # drop everything from the run start onwards.
            while start < len(parent.childNodes):
              parent.removeChild(parent.childNodes[start])
        # end - start nodes were removed in front of the current position
        i -= end - start
      start = None
    i += 1
105
def process_body(nodes, strings, counter=1):
  """Replace translatable text below *nodes* by numbered placeholders.

  *nodes* maps locale codes to corresponding nodes and must contain "en",
  which drives the traversal. *strings* maps locale codes to the dictionaries
  receiving the extracted messages. *counter* is the number to use for the
  next string.

  Elements other than <style>, <script> and <fix> have their inline content
  merged (see merge_children) and their children processed recursively. For
  each non-blank English text node, the text of every locale is stored as
  ``s<counter>`` (unless empty or marked with "[untr]") and the node value is
  replaced by ``$s<counter>$`` - or ``$s<counter>(link, ...)$`` if the English
  node carries links - with surrounding whitespace preserved. Other node
  types are reported on stderr.

  Returns the counter value to use for the next string.
  """
  en = nodes["en"]
  if en.nodeType == Node.ELEMENT_NODE:
    if en.tagName not in ("style", "script", "fix"):
      merge_children(nodes)
      for i in range(len(en.childNodes)):
        # Only locales that still have a child at this position take part
        new_nodes = {}
        for locale, value in nodes.items():
          if len(value.childNodes) > i:
            new_nodes[locale] = value.childNodes[i]
        counter = process_body(new_nodes, strings, counter)
  elif en.nodeType == Node.TEXT_NODE:
    if en.nodeValue.strip():
      # The link list is always taken from the English node
      if hasattr(en, "links") and len(en.links):
        links = "(%s)" % ", ".join(en.links)
      else:
        links = ""
      for locale, value in nodes.items():
        text = value.nodeValue or ""
        pre, text, post = re.search(r"^(\s*)(.*?)(\s*)$", text, re.S).groups()
        if text and "[untr]" not in text:
          strings[locale]["s%i" % counter] = {"message": text}
        value.nodeValue = "%s$s%i%s$%s" % (pre, counter, links, post)
      counter += 1
  else:
    sys.stderr.write("Unexpected node type %i\n" % en.nodeType)

  return counter
133
def process_page(path, menu):
  """Convert the page at *path* along with its translations.

  *path* is the English source file ("page!<name>" or "<dir>/page!<name>"),
  the same file is looked up for every entry in ``locales``. *menu* maps
  locale codes to the menu strings collected by process_menu; entries for
  the page being processed are removed from it since the page title ends up
  in the page's own locale file.

  The English markup has its text replaced by placeholders and is written to
  ``pages/<pagename>.raw`` below the output directory (``includes/index.tmpl``
  for the index page, with placeholders turned into translate calls). The
  strings of each locale are written to ``locales/<locale>/<pagename>.json``.
  """
  pagename = os.path.join(os.path.dirname(path), os.path.basename(path).replace("page!", ""))
  if "/" not in pagename:
    # Top-level file, it is named after the locale and holds the index page
    pagename = os.path.join(pagename, "index")
    path_format = "page!%s"
  else:
    path_format = "%s/" + path.split("/", 1)[1]
  # Drop the leading locale directory
  pagename = pagename.split("/", 1)[1]

  data = {}
  strings = {}
  for locale in locales:
    source = path_format % locale
    if not os.path.exists(source):
      continue
    data[locale] = read_xml(source)
    strings[locale] = OrderedDict()

  for locale in data:
    title = get_text(get_element(data[locale].documentElement, "title", "anwv")).strip()
    if title and "[untr]" not in title:
      strings[locale]["title"] = {"message": title}

  titlestring = "title"
  if pagename in menu["en"]:
    if menu["en"][pagename]["message"] != strings["en"]["title"]["message"]:
      # Menu text and page title differ: the menu text becomes "title", the
      # original page title is kept as "title-full".
      titlestring = "title-full"
      for locale in locales:
        if locale in strings and "title" in strings[locale]:
          title = strings[locale]["title"]
          if locale in menu and pagename in menu[locale]:
            strings[locale]["title"] = menu[locale][pagename]
          else:
            del strings[locale]["title"]
          strings[locale]["title-full"] = title
    # The menu string now lives in the page's locale file, remove it from
    # the menu of every locale that has it.
    for locale in locales:
      if locale in menu and pagename in menu[locale]:
        del menu[locale][pagename]

  bodies = {}
  for locale, value in data.items():
    bodies[locale] = get_element(value.documentElement, "body", "anwv")
    if pagename == "index":
      container = get_element(bodies[locale], "div")
      container.setAttribute("id", "install-container")
      container.setAttribute("class", "{{page}}")
  process_body(bodies, strings)

  # Only the English markup is kept, with the <anwv> wrappers removed
  body = re.sub(r"</?anwv/?>", "", bodies["en"].toxml())
  head = re.sub(r"</?anwv/?>", "", get_element(data["en"].documentElement, "head", "anwv").toxml())
  if head:
    pagedata = "<head>%s</head>%s" % (head, body)
  else:
    pagedata = body

  pagedata = pagedata.replace("/_override-static/global/global", "")
  pagedata = re.sub(r"</?fix/?>", "", pagedata, flags=re.S)

  # <foo><attr name="bar">test</attr> => <foo bar="test">
  pagedata = re.sub(r'>\s*<attr\s+name="(\w+)">([^"<>]*)</attr\b', r' \1="\2"', pagedata, flags=re.S)

  # <script src=""/> => <script src=""></script>
  pagedata = re.sub(r'<((?!link\b|meta\b|br\b|col\b|base\b|img\b|param\b|area\b|hr\b|input\b)([\w:]+)\b[^<>]*)/>', r'<\1></\2>', pagedata, flags=re.S)

  # <img src="foo"/> => <img src="foo">
  pagedata = re.sub(r'\/>', r'>', pagedata)

  # <img src="foo">dummy</img> => <img src="foo">
  pagedata = re.sub(r'<((link|meta|br|col|base|img|param|area|hr|input)\b[^<>]*)>([^<>]*)</\2>', r'<\1>', pagedata, flags=re.S)

  if pagename == "index":
    def translate_tag(match):
      # $s1(a, b)$ => {{"s1"|translate(links=["a", "b"])}}
      return r'{{"%s"|translate(links=[%s])}}' % (match.group(1), '"%s"' % '", "'.join(match.group(2).split(", ")))

    pagedata = re.sub(r"\$([\w\-]+)\$", r'{{"\1"|translate}}', pagedata)
    pagedata = re.sub(r"\$([\w\-]+)\((.*?)\)\$", translate_tag, pagedata)
    pagedata = "noheading=True\n\n%s" % pagedata
  elif titlestring != "title":
    pagedata = "title=%s\n\n%s" % (titlestring, pagedata)

  if pagename == "index":
    target = os.path.join(output_dir, "includes", pagename + ".tmpl")
  else:
    target = os.path.join(output_dir, "pages", pagename + ".raw")
  ensure_dir(target)
  with codecs.open(target, "wb", encoding="utf-8") as handle:
    handle.write(pagedata)

  for locale, value in strings.items():
    # Locales without any translated string get no file
    if value:
      localefile = os.path.join(output_dir, "locales", locale, pagename + ".json")
      save_locale(localefile, value)
225
def process_image(path):
  """Copy the image at *path* into the output tree.

  The "image!" marker is removed from the file name. Images with a path
  starting with "en/" are placed below ``locales``, all other images below
  ``static``; the directory part of *path* is kept in both cases.
  """
  section = "locales" if path.startswith("en/") else "static"
  filename = os.path.basename(path).replace("image!", "")
  target = os.path.join(output_dir, section, os.path.dirname(path), filename)

  with open(path, "rb") as source:
    contents = source.read()

  ensure_dir(target)
  with open(target, "wb") as destination:
    destination.write(contents)
237
def process_file(path, menu):
  """Convert a single source file, dispatching on its file name prefix.

  "page!..." files are handled by process_page, "image!..." files by
  process_image. A few pages are skipped silently, any other file is
  reported on stderr and ignored.
  """
  name = os.path.basename(path)
  if name in ("page!footer", "page!internet-explorer", "page!contribute-old"):
    return

  if name.startswith("page!"):
    process_page(path, menu)
  elif name.startswith("image!"):
    process_image(path)
  else:
    sys.stderr.write("Ignoring file %s\n" % path)
248
def process(path, menu):
  """Convert *path*, descending into it recursively if it is a directory.

  Regular files are passed to process_file together with *menu*. A path that
  is neither a file nor a directory is reported on stderr and ignored.
  """
  if os.path.isfile(path):
    process_file(path, menu)
  elif os.path.isdir(path):
    for filename in os.listdir(path):
      process(os.path.join(path, filename), menu)
  else:
    sys.stderr.write("Ignoring file %s\n" % path)
257
def process_menu():
  """Collect the menu and footer strings of all locales.

  For each entry of ``locales`` the files ``<locale>/_include/menu!menu`` and
  ``<locale>/_include/page!footer`` are read if they exist. Strings that are
  empty or marked with "[untr]" are left out.

  Returns a dictionary mapping every locale code to an OrderedDict of
  ``{string name: {"message": text}}`` entries (empty if nothing was found).

  Raises Exception for a menu item with an unexpected URL.
  """
  menu = {}

  menu_format = "%s/_include/menu!menu"
  footer_format = "%s/_include/page!footer"
  # Names for the footer headings, assigned to the <h1> elements in order
  footer_headings = ("resources", "community", "development", "follow-us")

  for locale in locales:
    entries = menu[locale] = OrderedDict()

    menu_path = menu_format % locale
    if os.path.exists(menu_path):
      data = read_xml(menu_path)
      items = get_element(data.documentElement, "items")
      for node in items.childNodes:
        text = get_text(get_element(node, "mainlink", "anwv", "title", "anwv")).strip()
        url = get_text(get_element(node, "mainlink", "anwv", "url", "anwv")).strip()
        # The string name is derived from the link target
        if url == "en":
          key = "installation"
        elif url.startswith("en/"):
          key = url.replace("en/", "")
        elif url == "/languages/":
          continue    # Unused string
        elif url == "/search/":
          key = "search"
        else:
          raise Exception("Unexpected URL in menu: %s" % url)
        if text and "[untr]" not in text:
          entries[key] = {"message": text}

    footer_path = footer_format % locale
    if os.path.exists(footer_path):
      data = read_xml(footer_path)
      for key, heading in zip(footer_headings, data.getElementsByTagName("h1")):
        text = get_text(heading).strip()
        if text and "[untr]" not in text:
          entries[key] = {"message": text}
      for link in data.getElementsByTagName("a"):
        # NOTE(review): only "/de/" is removed here, for every locale -
        # confirm that this is intentional.
        url = link.getAttribute("href").replace("/de/", "")
        text = get_text(link).strip()
        if url == "/forum/viewforum.php?f=11":
          # Stored under a temporary name, the script renames it to "bugs"
          # before the menu strings are saved.
          key = "_bugs"
        elif url.startswith("/"):
          # Last path component of the link target
          key = url.strip("/").split("/")[-1]
        else:
          key = url
        if text and "[untr]" not in text:
          entries[key] = {"message": text}
  return menu
301
# Collect the menu strings first: converting the pages removes the entries
# that end up in the pages' own locale files.
menu = process_menu()
process("page!en", menu)
process("en", menu)
process("images", menu)

# Whatever is left in the menu goes into one menu.json file per locale
for locale, value in menu.items():
  if "_bugs" in value:
    # Rename the temporary "_bugs" entry to its final name
    value["bugs"] = value.pop("_bugs")
  localefile = os.path.join(output_dir, "locales", locale, "menu.json")
  save_locale(localefile, value)