| 1 | #!/usr/bin/env python |
|---|
| 2 | # coding: utf-8 |
|---|
| 3 | |
|---|
| 4 | import sys, os, re, json, errno, codecs, itertools |
|---|
| 5 | from collections import OrderedDict |
|---|
| 6 | from xml.dom import minidom, Node |
|---|
| 7 | |
|---|
# Root of the generated tree; every output path in this script is built
# below this directory.
output_dir = "../wwwnew"

# Locale codes that are probed, in this order, when pages and menus are
# converted.
locales = (
    "ar", "bg", "de", "en",
    "es", "fr", "he", "hu",
    "ko", "lt", "nl", "pt_BR",
    "ru", "sk", "zh_CN", "zh_TW",
)
|---|
| 10 | |
|---|
def ensure_dir(path):
    """Make sure the directory that will contain *path* exists.

    Only the parent directory of ``path`` is created (with all missing
    intermediate directories); the file itself is not touched.

    Raises:
        OSError: if the directory cannot be created for any reason other
            than it already existing.
    """
    directory = os.path.dirname(path)
    if not directory:
        # A bare file name has no directory part; os.makedirs("") would
        # raise even though there is nothing to create.
        return
    try:
        os.makedirs(directory)
    except OSError as e:
        # An already existing directory is the expected case on repeated calls
        if e.errno != errno.EEXIST:
            raise
|---|
| 17 | |
|---|
def read_xml(path):
    """Read a page fragment from *path* and parse it into a minidom document.

    The raw bytes are patched up before parsing:

    * ampersands that neither start an entity/character reference nor are
      adjacent to another ampersand are escaped as ``&amp;``;
    * the ``en/`` prefix is dropped from ``href`` and ``src`` attributes and
      ``href="en"`` becomes ``href="index"``.

    The fragment is wrapped in a ``<root>`` element preceded by a DOCTYPE
    that declares the ``mdash`` and ``nbsp`` entities, so the returned
    document's ``documentElement`` is that ``<root>`` wrapper.

    Everything is handled as bytes so the function behaves the same on
    Python 2 and Python 3.
    """
    with open(path, "rb") as handle:
        xml = handle.read()

    # Escape stray ampersands; the replacement has to be the escaped form,
    # otherwise the substitution is a no-op and the parser rejects the input.
    xml = re.sub(br"(?<!&)&(?!#?\w+;|&)", b"&amp;", xml)
    xml = xml.replace(b' href="en/', b' href="')
    xml = xml.replace(b' href="en"', b' href="index"')
    xml = xml.replace(b' src="en/', b' src="')

    # Numeric character references keep the entity definitions independent
    # of the encoding of this source file (U+2014 em dash, U+00A0 no-break
    # space).
    prefix = (b'<!DOCTYPE root [<!ENTITY mdash "&#8212;">'
              b'<!ENTITY nbsp "&#160;">]><root>')
    return minidom.parseString(prefix + xml + b"</root>")
|---|
| 26 | |
|---|
def save_locale(path, data):
    """Serialize *data* as UTF-8 encoded, two-space indented JSON at *path*.

    The parent directory of ``path`` is created first if necessary.
    Non-ASCII characters are written as-is rather than as ``\\u`` escapes.
    """
    ensure_dir(path)
    dump_options = dict(ensure_ascii=False, indent=2, separators=(',', ': '))
    with codecs.open(path, "wb", encoding="utf-8") as output:
        json.dump(data, output, **dump_options)
|---|
| 31 | |
|---|
def get_text(node):
    """Return the concatenated text of *node*'s direct children.

    Every child of ``node`` has to be a text node. When another kind of
    child is found, its node name is printed to stdout and an exception is
    raised.

    Raises:
        Exception: if a child is not a text node.
    """
    parts = []
    for child in node.childNodes:
        if child.nodeType != Node.TEXT_NODE:
            # nodeName exists on every DOM node, whereas tagName is only
            # defined for elements and would raise AttributeError for
            # comments etc., hiding the real problem.
            print(child.nodeName)
            raise Exception("Unexpected node type %i" % child.nodeType)
        parts.append(child.nodeValue)
    return "".join(parts)
|---|
| 40 | |
|---|
def get_element(node, tagName, *args):
    """Walk down from *node* along a path of tag names.

    At every level the first child element carrying the requested tag name
    is chosen; ``tagName`` is the first step and ``args`` are the following
    ones. There is no backtracking: if the chosen element lacks the next
    step, the lookup fails even when a later sibling would have matched.

    Returns the element at the end of the path, or None if a step cannot
    be resolved.
    """
    current = node
    for wanted in (tagName,) + args:
        for candidate in current.childNodes:
            if candidate.nodeType == Node.ELEMENT_NODE and candidate.tagName == wanted:
                current = candidate
                break
        else:
            # No child with this tag name at the current level
            return None
    return current
|---|
| 49 | |
|---|
def merge_children(nodes):
    """Merge runs of inline content into single text nodes, for all locales.

    ``nodes`` maps locale codes to parallel DOM nodes; the ``"en"`` entry is
    the reference. Its children are scanned for consecutive runs of
    "text-like" nodes: plain text nodes, attribute-less ``<em>``/``<strong>``
    elements wrapping a single text node, and ``<a>`` elements whose only
    attribute is ``href`` and whose only child is a text node. Whitespace-only
    text nodes at either end of a run are left out of it.

    Every run spanning at least two nodes is replaced, in each locale, by one
    text node whose value is the XML serialization of the run. The ``href``
    values of the links in the run are stripped from the markup and stored
    in order on the new node's ``links`` attribute. A locale that has too few
    children to cover the run loses all its children from the start of the
    run onwards.

    The DOM trees are modified in place; nothing is returned.
    """
    def is_text(node):
        if node.nodeType == Node.TEXT_NODE:
            return True
        if node.nodeType != Node.ELEMENT_NODE:
            return False
        if len(node.childNodes) != 1 or node.firstChild.nodeType != Node.TEXT_NODE:
            return False
        if node.tagName in ("em", "strong"):
            return len(node.attributes) == 0
        if node.tagName == "a":
            return len(node.attributes) == 1 and node.hasAttribute("href")
        return False

    def is_empty(node):
        return node.nodeType == Node.TEXT_NODE and not node.nodeValue.strip()

    en = nodes["en"]
    start = None
    # An explicit index is required: merging shrinks en.childNodes, so both
    # the position and the upper bound have to follow the live list. One
    # extra iteration past the last child closes a run that reaches the end.
    i = 0
    while i <= len(en.childNodes):
        at_end = i >= len(en.childNodes)
        if start is None:
            if not at_end and is_text(en.childNodes[i]):
                start = i
        elif at_end or not is_text(en.childNodes[i]):
            end = i - 1
            while start < end and is_empty(en.childNodes[start]):
                start += 1
            while start < end and is_empty(en.childNodes[end]):
                end -= 1
            if start < end:
                for parent in nodes.values():
                    if end < len(parent.childNodes):
                        text = []
                        links = []
                        for child in parent.childNodes[start:end + 1]:
                            if child.nodeType == Node.ELEMENT_NODE and child.tagName == "a":
                                links.append(child.getAttribute("href"))
                                child.removeAttribute("href")
                            text.append(child.toxml())
                        node = parent.ownerDocument.createTextNode("".join(text))
                        node.links = links
                        parent.replaceChild(node, parent.childNodes[start])
                        # The slice is a copy, so removing while iterating is safe
                        for child in parent.childNodes[start + 1:end + 1]:
                            parent.removeChild(child)
                    else:
                        # Structure differs from English: drop the remainder
                        while start < len(parent.childNodes):
                            parent.removeChild(parent.childNodes[start])
                # end - start nodes disappeared in front of the current
                # position; step back so that no following node is skipped.
                i -= end - start
            start = None
        i += 1
|---|
| 105 | |
|---|
def process_body(nodes, strings, counter=1):
    """Extract translatable strings from parallel DOM trees.

    ``nodes`` maps locale codes to corresponding DOM nodes, with ``"en"`` as
    the reference that drives the traversal. Elements other than ``style``,
    ``script`` and ``fix`` get their inline children merged and are then
    processed child by child; a locale only takes part in a child's
    processing if it has a child at the same index.

    For every English text node that is not whitespace-only, each locale's
    text is stored (without surrounding whitespace) in
    ``strings[locale]["s<counter>"]`` unless it is empty or contains the
    ``[untr]`` marker. The node value is replaced by a ``$s<counter>$``
    placeholder, with the English node's link targets added in parentheses
    when present, keeping the original surrounding whitespace.

    Args:
        nodes: dict of locale code -> DOM node.
        strings: dict of locale code -> mapping receiving the strings.
        counter: number to use for the next string.

    Returns:
        The counter value to use for the next string.
    """
    en = nodes["en"]
    if en.nodeType == Node.ELEMENT_NODE:
        # style, script and fix elements are left untouched
        if en.tagName not in ("style", "script", "fix"):
            merge_children(nodes)
            for i in range(len(en.childNodes)):
                new_nodes = {}
                for locale, value in nodes.items():
                    if len(value.childNodes) > i:
                        new_nodes[locale] = value.childNodes[i]
                counter = process_body(new_nodes, strings, counter)
    elif en.nodeType == Node.TEXT_NODE:
        if en.nodeValue.strip():
            # The links attribute is set by merge_children on merged nodes
            if hasattr(en, "links") and len(en.links):
                links = "(%s)" % ", ".join(en.links)
            else:
                links = ""
            for locale, value in nodes.items():
                text = value.nodeValue or ""
                pre, text, post = re.search(r"^(\s*)(.*?)(\s*)$", text, re.S).groups()
                if text and text.find("[untr]") < 0:
                    strings[locale]["s%i" % counter] = {"message": text}
                value.nodeValue = "%s$s%i%s$%s" % (pre, counter, links, post)
            counter += 1
    else:
        sys.stderr.write("Unexpected node type %i\n" % en.nodeType)

    return counter
|---|
| 133 | |
|---|
def _normalize_markup(pagedata):
    """Turn the XML serialization of a page into HTML-style markup."""
    pagedata = pagedata.replace("/_override-static/global/global", "")
    pagedata = re.sub(r"</?fix/?>", "", pagedata, flags=re.S)

    # <foo><attr name="bar">test</attr> => <foo bar="test">
    pagedata = re.sub(r'>\s*<attr\s+name="(\w+)">([^"<>]*)</attr\b', r' \1="\2"', pagedata, flags=re.S)

    # <script src=""/> => <script src=""></script>
    pagedata = re.sub(r'<((?!link\b|meta\b|br\b|col\b|base\b|img\b|param\b|area\b|hr\b|input\b)([\w:]+)\b[^<>]*)/>', r'<\1></\2>', pagedata, flags=re.S)

    # <img src="foo"/> => <img src="foo">
    pagedata = re.sub(r'\/>', r'>', pagedata)

    # <img src="foo">dummy</img> => <img src="foo">
    pagedata = re.sub(r'<((link|meta|br|col|base|img|param|area|hr|input)\b[^<>]*)>([^<>]*)</\2>', r'<\1>', pagedata, flags=re.S)
    return pagedata


def process_page(path, menu):
    """Convert one page, in all available locales, into the new layout.

    ``path`` is the English source file (``page!en`` for the start page,
    ``en/page!<name>`` otherwise); the other locales are found by swapping
    the locale code in that path. The English markup, with translatable text
    replaced by string placeholders, is written below ``output_dir`` as
    ``includes/index.tmpl`` for the start page and ``pages/<name>.raw`` for
    everything else. The extracted strings are saved per locale as
    ``locales/<locale>/<name>.json``.

    ``menu`` is the structure built by process_menu(). If it has an English
    entry for this page whose text differs from the page title, the menu text
    becomes the page's ``title`` string and the real title is kept as
    ``title-full``. Entries for this page are removed from ``menu`` for all
    locales in either case, i.e. ``menu`` is modified in place.
    """
    pagename = os.path.join(os.path.dirname(path), os.path.basename(path).replace("page!", ""))
    if "/" not in pagename:
        pagename = os.path.join(pagename, "index")
        source_format = "page!%s"
    else:
        source_format = "%s/" + path.split("/", 1)[1]
    # Drop the leading locale directory ("en/faq" => "faq")
    pagename = pagename.split("/", 1)[1]

    data = {}
    strings = {}
    for locale in locales:
        source = source_format % locale
        if not os.path.exists(source):
            continue
        data[locale] = read_xml(source)
        strings[locale] = OrderedDict()

    for locale in data:
        title = get_text(get_element(data[locale].documentElement, "title", "anwv")).strip()
        if title and title.find("[untr]") < 0:
            strings[locale]["title"] = {"message": title}

    titlestring = "title"
    if pagename in menu["en"]:
        if menu["en"][pagename]["message"] != strings["en"]["title"]["message"]:
            titlestring = "title-full"
            for locale in locales:
                if locale in strings and "title" in strings[locale]:
                    title = strings[locale]["title"]
                    if locale in menu and pagename in menu[locale]:
                        strings[locale]["title"] = menu[locale][pagename]
                    else:
                        del strings[locale]["title"]
                    strings[locale]["title-full"] = title
        for locale in locales:
            # The membership test has to look at the locale's menu entries,
            # not at the characters of the locale code itself.
            if locale in menu and pagename in menu[locale]:
                del menu[locale][pagename]

    bodies = {}
    for locale, value in data.items():
        bodies[locale] = get_element(value.documentElement, "body", "anwv")
        if pagename == "index":
            container = get_element(bodies[locale], "div")
            container.setAttribute("id", "install-container")
            container.setAttribute("class", "{{page}}")
    process_body(bodies, strings)

    body = re.sub(r"</?anwv/?>", "", bodies["en"].toxml())
    # A page without any head section is treated like one with an empty head
    head_node = get_element(data["en"].documentElement, "head", "anwv")
    if head_node is not None:
        head = re.sub(r"</?anwv/?>", "", head_node.toxml())
    else:
        head = ""
    if head:
        pagedata = "<head>%s</head>%s" % (head, body)
    else:
        pagedata = body

    pagedata = _normalize_markup(pagedata)

    if pagename == "index":
        def translate_tag(match):
            return r'{{"%s"|translate(links=[%s])}}' % (match.group(1), '"%s"' % '", "'.join(match.group(2).split(", ")))

        # The start page becomes a template: placeholders turn into
        # translate filter expressions, with the link targets passed along.
        pagedata = re.sub(r"\$([\w\-]+)\$", r'{{"\1"|translate}}', pagedata)
        pagedata = re.sub(r"\$([\w\-]+)\((.*?)\)\$", translate_tag, pagedata)
        pagedata = "noheading=True\n\n%s" % pagedata
    elif titlestring != "title":
        pagedata = "title=%s\n\n%s" % (titlestring, pagedata)

    if pagename == "index":
        target = os.path.join(output_dir, "includes", pagename + ".tmpl")
    else:
        target = os.path.join(output_dir, "pages", pagename + ".raw")
    ensure_dir(target)
    with codecs.open(target, "wb", encoding="utf-8") as handle:
        handle.write(pagedata)

    for locale, value in strings.items():
        # Locales without any usable string get no file at all
        if value:
            localefile = os.path.join(output_dir, "locales", locale, pagename + ".json")
            save_locale(localefile, value)
|---|
| 225 | |
|---|
def process_image(path):
    """Copy an image into the output tree, dropping ``image!`` from its name.

    Images whose path starts with ``en/`` end up below the ``locales``
    directory of the output, all others below ``static``; the directory part
    of ``path`` is kept in both cases.
    """
    directory, filename = os.path.split(path)
    if path.startswith("en/"):
        category = "locales"
    else:
        category = "static"
    target = os.path.join(output_dir, category, directory, filename.replace("image!", ""))

    with open(path, "rb") as source:
        payload = source.read()

    ensure_dir(target)
    with open(target, "wb") as destination:
        destination.write(payload)
|---|
| 237 | |
|---|
def process_file(path, menu):
    """Dispatch a single source file to the matching converter.

    Files named ``page!footer``, ``page!internet-explorer`` and
    ``page!contribute-old`` are skipped silently. Other ``page!*`` files go
    to process_page() together with ``menu``, ``image!*`` files go to
    process_image(). Anything else is reported on stderr and ignored.
    """
    name = os.path.basename(path)
    if name in ("page!footer", "page!internet-explorer", "page!contribute-old"):
        return

    if name.startswith("page!"):
        process_page(path, menu)
    elif name.startswith("image!"):
        process_image(path)
    else:
        # sys.stderr.write works identically on Python 2 and 3
        sys.stderr.write("Ignoring file %s\n" % path)
|---|
| 248 | |
|---|
def process(path, menu):
    """Convert *path*, descending into it if it is a directory.

    Regular files are handed to process_file(); directories are walked
    recursively. A path that is neither a file nor a directory (for example
    one that does not exist) is reported on stderr and ignored.
    """
    if os.path.isfile(path):
        process_file(path, menu)
    elif os.path.isdir(path):
        for filename in os.listdir(path):
            process(os.path.join(path, filename), menu)
    else:
        # sys.stderr.write works identically on Python 2 and 3
        sys.stderr.write("Ignoring file %s\n" % path)
|---|
| 257 | |
|---|
def process_menu():
    """Collect the menu and footer strings of every locale.

    For each locale, ``<locale>/_include/menu!menu`` and
    ``<locale>/_include/page!footer`` are read if they exist. Menu items are
    keyed by a name derived from their URL, footer headings by a fixed list
    of names in document order, and footer links by the last component of
    their target. Empty texts and texts containing ``[untr]`` are left out.

    Returns:
        dict mapping every locale code to an OrderedDict of
        ``name -> {"message": text}``.

    Raises:
        Exception: if a menu item points to a URL that is not recognized.
    """
    menu = {}

    menu_format = "%s/_include/menu!menu"
    footer_format = "%s/_include/page!footer"
    footer_headings = ("resources", "community", "development", "follow-us")
    for locale in locales:
        menu[locale] = OrderedDict()
        if os.path.exists(menu_format % locale):
            data = read_xml(menu_format % locale)
            items = get_element(data.documentElement, "items")
            for node in items.childNodes:
                if node.nodeType != Node.ELEMENT_NODE:
                    # Whitespace or comments between items carry no menu entry
                    continue
                text = get_text(get_element(node, "mainlink", "anwv", "title", "anwv")).strip()
                url = get_text(get_element(node, "mainlink", "anwv", "url", "anwv")).strip()
                if url == "en":
                    string = "installation"
                elif url.startswith("en/"):
                    string = url.replace("en/", "")
                elif url == "/languages/":
                    continue # Unused string
                elif url == "/search/":
                    string = "search"
                else:
                    raise Exception("Unexpected URL in menu: %s" % url)
                if text and text.find("[untr]") < 0:
                    menu[locale][string] = {"message": text}
        if os.path.exists(footer_format % locale):
            data = read_xml(footer_format % locale)
            # The headings are matched to their names purely by position
            for string, heading in zip(footer_headings, data.getElementsByTagName("h1")):
                text = get_text(heading).strip()
                if text and text.find("[untr]") < 0:
                    menu[locale][string] = {"message": text}
            for link in data.getElementsByTagName("a"):
                url = link.getAttribute("href").replace("/de/", "")
                text = get_text(link).strip()
                if url == "/forum/viewforum.php?f=11":
                    string = "_bugs"
                elif url.startswith("/"):
                    string = url.strip("/").split("/")[-1]
                else:
                    string = url
                if text and text.find("[untr]") < 0:
                    menu[locale][string] = {"message": text}
    return menu
|---|
| 301 | |
|---|
# Script entry: collect the menu strings first, because converting the pages
# consults and modifies them, then convert the start page, the English page
# tree and the images.
menu = process_menu()
process("page!en", menu)
process("en", menu)
process("images", menu)

# Whatever is left of the menu strings is saved per locale; the "_bugs"
# entry is stored under its final name "bugs".
for locale, value in menu.items():
    if "_bugs" in value:
        value["bugs"] = value["_bugs"]
        del value["_bugs"]
    localefile = os.path.join(output_dir, "locales", locale, "menu.json")
    save_locale(localefile, value)
|---|