From 621c65caf24ad55da6d741d4241e283946f8f195 Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Fri, 21 Dec 2012 18:15:01 -0500
Subject: [PATCH 01/16] Improve error message and add TODOs

---
 wikiconvert.py        |  10 ++-
 wikiconvert_creole.py | 172 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 178 insertions(+), 4 deletions(-)
 create mode 100644 wikiconvert_creole.py

diff --git a/wikiconvert.py b/wikiconvert.py
index d09705d..683a625 100644
--- a/wikiconvert.py
+++ b/wikiconvert.py
@@ -39,10 +39,9 @@ def convert_file(proj_id, src_path, dst_dir):
         if line.startswith("#"):
             meta_lines.append(line)
         else:
-            if not line.strip():
-                body_lines = lines[i+1:]
-            else:
-                body_lines = lines[i:]
+            assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line)
+            # TODO is it actually mandatory that a blank line separate meta text from body text?
+            body_lines = lines[i+1:]
             break
     meta = {}
     for line in meta_lines:
@@ -58,6 +57,8 @@ def convert_file(proj_id, src_path, dst_dir):
     text = re.compile(r'^{{{+ *\n', re.M).sub(r"```\n", text)
     text = re.compile(r'^}}}+ *(\n|$)', re.M).sub(r"```\n", text)
 
+    # TODO: Add support for `backtick` code quotes
+    
     # Headings.
     text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text)
     text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text)
@@ -134,6 +135,7 @@ def _indent(text):
 
 def _gh_page_name_from_gc_page_name(gc):
     """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
+    # FIXME: fails on all uppercase / all lowercase names (e.g. FAQ)
     gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
     return gh
 
diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
new file mode 100644
index 0000000..8594a36
--- /dev/null
+++ b/wikiconvert_creole.py
@@ -0,0 +1,172 @@
+#!/usr/bin/env python
+
+"""
+Usage:
+    python googlecode2github/wikiconfig.py PROJID SRCDIR DSTDIR
+
+where "PROJID" is the github project id, e.g. "trentm/python-markdown2",
+"SRCDIR" is a Google Code project wiki Subversion working copy dir and
+"DSTDIR" is the git clone dir of the git project's wiki.
+"""
+
+__version__ = "1.0.0"
+
+import re
+import sys
+from os.path import *
+from glob import glob
+from pprint import pprint
+import codecs
+from hashlib import md5
+
+
+def log(s):
+    sys.stderr.write(s+"\n")
+
+def convert_dir(proj_id, src_dir, dst_dir):
+    if isfile(src_dir):
+        convert_file(proj_id, src_dir, dst_dir)
+    else:
+        for f in glob(join(src_dir, "*.wiki")):
+            convert_file(proj_id, f, dst_dir)
+
+def convert_file(proj_id, src_path, dst_dir):
+    src = codecs.open(src_path, 'r', 'utf-8').read()
+    meta_lines = []
+    body_lines = []
+    lines = src.splitlines(False)
+    for i, line in enumerate(lines):
+        if line.startswith("#"):
+            meta_lines.append(line)
+        else:
+            assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line)
+            # TODO is it actually mandtory that a blank line separate meta text from body text?
+            body_lines = lines[i+1:]
+            break
+    meta = {}
+    for line in meta_lines:
+        k,v = line[1:].split(None, 1)
+        meta[k] = v
+    text = '\n'.join(body_lines)
+    s_from_hash = {}
+
+    # Pull out pre-blocks so we can restore them unmunged
+    def sub_block(match,indent=True):
+        pre = match.group(1)
+        hash = md5(pre.encode('utf8')).hexdigest()
+        s_from_hash[hash] = _indent(pre) if indent else pre
+        return hash
+    def sub_pre_block(match):
+        return sub_block(match,indent=True)
+    # TODO not sure newline is correct after opening braces {{{code}}}
+    text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text)
+
+    #  Pull out `backtick` code quotes 
+    #def sub_code(match)
+     #   return sub_block(match,indent=False)
+    text = re.compile(r'^`(.*?)^`', re.M|re.S).sub(r'##{{{\1}}}##', text), text) # monospace literal for Creole 
+    
+    # Headings - No conversion needed for Creole. Markdown conversion below
+    #text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text)
+    #text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text)
+    #text = re.compile(r'^=(.*?)=\s*$', re.M).sub(lambda m: "# %s\n"%m.group(1).strip(), text)
+
+    # Tables
+    def sub_table_html(m):
+        rows = []
+        for line in m.group(0).splitlines(False):
+            if not line.strip():
+                continue
+            rows.append(list(c.strip() for c in line.split("||")[1:-1]))
+        lines = ['<table>']
+        for row in rows:
+            lines.append('  <tr>%s</tr>' % ''.join('<td>%s</td>' % c for c in row))
+        lines.append('</table>')
+        return '\n\n' + '\n'.join(lines)
+    def sub_table_creole(m):
+        rows = []
+        for line in m.group(0).splitlines(False):
+            if not line.strip():
+                continue
+            rows.append(list(c.strip() for c in line.split("||")[1:-1]))
+        lines = []
+        # Assume first row is a header (or should we assume the reverse?)
+        lines.append('|='.join(row[0]))[:-1] # skip trailing equal sign
+        for row in rows[1:]:
+            lines.append('|'.join(row))
+        return '\n\n' + '\n'.join(lines)
+    text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text)
+
+    # Lists (doesn't handle nested lists).
+    # TODO: leave bullet marker unchanged for *, -, +
+    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text)
+    text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
+
+    # wiki links. - Creole & Markdown are the same - no change required to conversion
+    def sub_wikilink(m):
+        gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ')
+        if m.group(2):
+            s = "[[%s|%s]]" % (gh_page_name, m.group(2))
+            pass
+        else:
+            s = "[[%s]]" % gh_page_name
+        hash = md5(s.encode('utf8')).hexdigest()
+        s_from_hash[hash] = s
+        return hash
+    text = re.compile(r'\[((?:[A-Z][a-z]+)+)(?:\s+(.*?))?\]', re.S).sub(sub_wikilink, text)
+
+    # Links
+    def sub_link(m):
+        # s = "[%s](%s)" % (m.group(2), m.group(1)) # Markdown
+        s = "[[%s|%s]]" % (m.group(1), m.group(2)) # Creole
+        hash = md5(s.encode('utf8')).hexdigest()
+        s_from_hash[hash] = s
+        return hash
+    text = re.compile(r'(?<!\[)\[([^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
+
+    # Italics, bold. - same for both Markdown & Creole
+    # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
+    text = re.compile(r'(?<![*\w])\*([^*]+?)\*(?![*\w])', re.S).sub(r'**\1**', text)
+    text = re.compile(r'(?<![_\w])_([^_]+?)_(?![_\w])', re.S).sub(r'*\1*', text)
+
+    # Auto-linking "issue \d+"
+    # TODO: Construct Google Code -> Github issue lookup map
+    text = re.compile(r'(?<!\[)(issue (\d+))(?!\])').sub(
+        r'[\1](https://github.com/%s/issues#issue/\2)' % proj_id, text)
+
+    # Restore hashed-out blocks.
+    for hash, s in s_from_hash.items():
+        text = text.replace(hash, s)
+
+    # Add summary.
+    if "summary" in meta:
+        text = ("# %s\n\n" % meta["summary"]) + text
+
+    base = splitext(basename(src_path))[0]
+    gh_page_name = _gh_page_name_from_gc_page_name(base)
+    dst_path = join(dst_dir, gh_page_name+".md")
+    if not exists(dst_path) or codecs.open(dst_path, 'r', 'utf-8').read() != text:
+        codecs.open(dst_path, 'w', 'utf-8').write(text)
+        log("wrote '%s'" % dst_path)
+
+
+#---- internal support stuff
+
+def _indent(text):
+    return '    ' + '\n    '.join(text.splitlines(False))
+
+def _gh_page_name_from_gc_page_name(gc):
+    """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
+    # FIXME: fails on all uppercase / all lowercase names (e.g. FAQ)
+    gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
+    return gh
+
+
+#---- mainline
+
+if __name__ == '__main__':
+    convert_dir("OpenRefine/OpenRefine", "c:/users/tfmorris/tmp/grefine-wiki", "c:/users/tfmorris/tmp/orefine-wiki")
+    if len(sys.argv) != 4:
+        print __doc__
+        sys.exit(1)
+    convert_dir(sys.argv[1], sys.argv[2], sys.argv[3])

From 9b78a91ad2ae4d84161dd71b18a40c6ea211c6a4 Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Fri, 21 Dec 2012 18:19:02 -0500
Subject: [PATCH 02/16] First pass

As used for the first pass of automatic conversion. Still needs work.
---
 wikiconvert_creole.py | 347 +++++++++++++++++++++---------------------
 1 file changed, 175 insertions(+), 172 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index 8594a36..fa5a6ae 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -1,172 +1,175 @@
-#!/usr/bin/env python
-
-"""
-Usage:
-    python googlecode2github/wikiconfig.py PROJID SRCDIR DSTDIR
-
-where "PROJID" is the github project id, e.g. "trentm/python-markdown2",
-"SRCDIR" is a Google Code project wiki Subversion working copy dir and
-"DSTDIR" is the git clone dir of the git project's wiki.
-"""
-
-__version__ = "1.0.0"
-
-import re
-import sys
-from os.path import *
-from glob import glob
-from pprint import pprint
-import codecs
-from hashlib import md5
-
-
-def log(s):
-    sys.stderr.write(s+"\n")
-
-def convert_dir(proj_id, src_dir, dst_dir):
-    if isfile(src_dir):
-        convert_file(proj_id, src_dir, dst_dir)
-    else:
-        for f in glob(join(src_dir, "*.wiki")):
-            convert_file(proj_id, f, dst_dir)
-
-def convert_file(proj_id, src_path, dst_dir):
-    src = codecs.open(src_path, 'r', 'utf-8').read()
-    meta_lines = []
-    body_lines = []
-    lines = src.splitlines(False)
-    for i, line in enumerate(lines):
-        if line.startswith("#"):
-            meta_lines.append(line)
-        else:
-            assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line)
-            # TODO is it actually mandtory that a blank line separate meta text from body text?
-            body_lines = lines[i+1:]
-            break
-    meta = {}
-    for line in meta_lines:
-        k,v = line[1:].split(None, 1)
-        meta[k] = v
-    text = '\n'.join(body_lines)
-    s_from_hash = {}
-
-    # Pull out pre-blocks so we can restore them unmunged
-    def sub_block(match,indent=True):
-        pre = match.group(1)
-        hash = md5(pre.encode('utf8')).hexdigest()
-        s_from_hash[hash] = _indent(pre) if indent else pre
-        return hash
-    def sub_pre_block(match):
-        return sub_block(match,indent=True)
-    # TODO not sure newline is correct after opening braces {{{code}}}
-    text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text)
-
-    #  Pull out `backtick` code quotes 
-    #def sub_code(match)
-     #   return sub_block(match,indent=False)
-    text = re.compile(r'^`(.*?)^`', re.M|re.S).sub(r'##{{{\1}}}##', text), text) # monospace literal for Creole 
-    
-    # Headings - No conversion needed for Creole. Markdown conversion below
-    #text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text)
-    #text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text)
-    #text = re.compile(r'^=(.*?)=\s*$', re.M).sub(lambda m: "# %s\n"%m.group(1).strip(), text)
-
-    # Tables
-    def sub_table_html(m):
-        rows = []
-        for line in m.group(0).splitlines(False):
-            if not line.strip():
-                continue
-            rows.append(list(c.strip() for c in line.split("||")[1:-1]))
-        lines = ['<table>']
-        for row in rows:
-            lines.append('  <tr>%s</tr>' % ''.join('<td>%s</td>' % c for c in row))
-        lines.append('</table>')
-        return '\n\n' + '\n'.join(lines)
-    def sub_table_creole(m):
-        rows = []
-        for line in m.group(0).splitlines(False):
-            if not line.strip():
-                continue
-            rows.append(list(c.strip() for c in line.split("||")[1:-1]))
-        lines = []
-        # Assume first row is a header (or should we assume the reverse?)
-        lines.append('|='.join(row[0]))[:-1] # skip trailing equal sign
-        for row in rows[1:]:
-            lines.append('|'.join(row))
-        return '\n\n' + '\n'.join(lines)
-    text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text)
-
-    # Lists (doesn't handle nested lists).
-    # TODO: leave bullet marker unchanged for *, -, +
-    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text)
-    text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
-
-    # wiki links. - Creole & Markdown are the same - no change required to conversion
-    def sub_wikilink(m):
-        gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ')
-        if m.group(2):
-            s = "[[%s|%s]]" % (gh_page_name, m.group(2))
-            pass
-        else:
-            s = "[[%s]]" % gh_page_name
-        hash = md5(s.encode('utf8')).hexdigest()
-        s_from_hash[hash] = s
-        return hash
-    text = re.compile(r'\[((?:[A-Z][a-z]+)+)(?:\s+(.*?))?\]', re.S).sub(sub_wikilink, text)
-
-    # Links
-    def sub_link(m):
-        # s = "[%s](%s)" % (m.group(2), m.group(1)) # Markdown
-        s = "[[%s|%s]]" % (m.group(1), m.group(2)) # Creole
-        hash = md5(s.encode('utf8')).hexdigest()
-        s_from_hash[hash] = s
-        return hash
-    text = re.compile(r'(?<!\[)\[([^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
-
-    # Italics, bold. - same for both Markdown & Creole
-    # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
-    text = re.compile(r'(?<![*\w])\*([^*]+?)\*(?![*\w])', re.S).sub(r'**\1**', text)
-    text = re.compile(r'(?<![_\w])_([^_]+?)_(?![_\w])', re.S).sub(r'*\1*', text)
-
-    # Auto-linking "issue \d+"
-    # TODO: Construct Google Code -> Github issue lookup map
-    text = re.compile(r'(?<!\[)(issue (\d+))(?!\])').sub(
-        r'[\1](https://github.com/%s/issues#issue/\2)' % proj_id, text)
-
-    # Restore hashed-out blocks.
-    for hash, s in s_from_hash.items():
-        text = text.replace(hash, s)
-
-    # Add summary.
-    if "summary" in meta:
-        text = ("# %s\n\n" % meta["summary"]) + text
-
-    base = splitext(basename(src_path))[0]
-    gh_page_name = _gh_page_name_from_gc_page_name(base)
-    dst_path = join(dst_dir, gh_page_name+".md")
-    if not exists(dst_path) or codecs.open(dst_path, 'r', 'utf-8').read() != text:
-        codecs.open(dst_path, 'w', 'utf-8').write(text)
-        log("wrote '%s'" % dst_path)
-
-
-#---- internal support stuff
-
-def _indent(text):
-    return '    ' + '\n    '.join(text.splitlines(False))
-
-def _gh_page_name_from_gc_page_name(gc):
-    """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
-    # FIXME: fails on all uppercase / all lowercase names (e.g. FAQ)
-    gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
-    return gh
-
-
-#---- mainline
-
-if __name__ == '__main__':
-    convert_dir("OpenRefine/OpenRefine", "c:/users/tfmorris/tmp/grefine-wiki", "c:/users/tfmorris/tmp/orefine-wiki")
-    if len(sys.argv) != 4:
-        print __doc__
-        sys.exit(1)
-    convert_dir(sys.argv[1], sys.argv[2], sys.argv[3])
+#!/usr/bin/env python
+
+"""
+Usage:
+    python googlecode2github/wikiconfig.py PROJID SRCDIR DSTDIR
+
+where "PROJID" is the github project id, e.g. "trentm/python-markdown2",
+"SRCDIR" is a Google Code project wiki Subversion working copy dir and
+"DSTDIR" is the git clone dir of the git project's wiki.
+"""
+
+__version__ = "1.0.0"
+
+import re
+import sys
+from os.path import *
+from glob import glob
+from pprint import pprint
+import codecs
+from hashlib import md5
+
+
+def log(s):
+    sys.stderr.write(s+"\n")
+
+def convert_dir(proj_id, src_dir, dst_dir):
+    if isfile(src_dir):
+        convert_file(proj_id, src_dir, dst_dir)
+    else:
+        for f in glob(join(src_dir, "*.wiki")):
+            convert_file(proj_id, f, dst_dir)
+
+def convert_file(proj_id, src_path, dst_dir):
+    src = codecs.open(src_path, 'r', 'utf-8').read()
+    meta_lines = []
+    body_lines = []
+    lines = src.splitlines(False)
+    for i, line in enumerate(lines):
+        if line.startswith("#"):
+            meta_lines.append(line)
+        else:
+            assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line)
+            # TODO is it actually mandtory that a blank line separate meta text from body text?
+            body_lines = lines[i+1:]
+            break
+    meta = {}
+    for line in meta_lines:
+        k,v = line[1:].split(None, 1)
+        meta[k] = v
+    text = '\n'.join(body_lines)
+    s_from_hash = {}
+
+    # Pull out pre-blocks so we can restore them unmunged
+    def sub_block(match,indent=True):
+        pre = match.group(1)
+        hash = md5(pre.encode('utf8')).hexdigest()
+        s_from_hash[hash] = _indent(pre) if indent else pre
+        return hash
+    def sub_pre_block(match):
+        return sub_block(match,indent=True)
+    # TODO not sure newline is correct after opening braces {{{code}}}
+    text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text)
+
+    #  Pull out `backtick` code quotes 
+    #def sub_code(match)
+     #   return sub_block(match,indent=False)
+    text = re.compile(r'^`(.*?)^`', re.M|re.S).sub(r'##{{{\1}}}##', text) # monospace literal for Creole 
+    
+    # Headings - No conversion needed for Creole. Markdown conversion below
+    #text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text)
+    #text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text)
+    #text = re.compile(r'^=(.*?)=\s*$', re.M).sub(lambda m: "# %s\n"%m.group(1).strip(), text)
+
+    # Tables
+    def sub_table_html(m):
+        rows = []
+        for line in m.group(0).splitlines(False):
+            if not line.strip():
+                continue
+            rows.append(list(c.strip() for c in line.split("||")[1:-1]))
+        lines = ['<table>']
+        for row in rows:
+            lines.append('  <tr>%s</tr>' % ''.join('<td>%s</td>' % c for c in row))
+        lines.append('</table>')
+        return '\n\n' + '\n'.join(lines)
+    def sub_table_creole(m):
+        rows = []
+        for line in m.group(0).splitlines(False):
+            if not line.strip():
+                continue
+            rows.append(list(c.strip() for c in line.split("||")[1:-1]))
+        lines = []
+        # Assume first row is a header (or should we assume the reverse?)
+        if rows:
+            lines.append('|='.join(rows[0])[:-1] )# skip trailing equal sign
+            for row in rows[1:]:
+                lines.append('|'.join(row))
+            return '\n\n' + '\n'.join(lines)
+#    text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text)
+    text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text)
+
+    # Lists (doesn't handle nested lists).
+    # TODO: leave bullet marker unchanged for *, -, +
+    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text)
+    text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
+
+    # wiki links. - Creole & Markdown are the same - no change required to conversion
+    def sub_wikilink(m):
+        gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ')
+        if m.group(2):
+            s = "[[%s|%s]]" % (gh_page_name, m.group(2))
+            pass
+        else:
+            s = "[[%s]]" % gh_page_name
+        hash = md5(s.encode('utf8')).hexdigest()
+        s_from_hash[hash] = s
+        return hash
+    text = re.compile(r'\[((?:[A-Z][a-z]+)+)(?:\s+(.*?))?\]', re.S).sub(sub_wikilink, text)
+
+    # Links
+    def sub_link(m):
+        # s = "[%s](%s)" % (m.group(2), m.group(1)) # Markdown
+        s = "[[%s|%s]]" % (m.group(1), m.group(2)) # Creole
+        hash = md5(s.encode('utf8')).hexdigest()
+        s_from_hash[hash] = s
+        return hash
+    text = re.compile(r'(?<!\[)\[([^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
+
+    # Italics, bold. - same for both Markdown & Creole
+    # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
+    text = re.compile(r'(?<![*\w])\*([^*]+?)\*(?![*\w])', re.S).sub(r'**\1**', text)
+    text = re.compile(r'(?<![_\w])_([^_]+?)_(?![_\w])', re.S).sub(r'*\1*', text)
+
+    # Auto-linking "issue \d+"
+    # TODO: Construct Google Code -> Github issue lookup map
+    text = re.compile(r'(?<!\[)(issue (\d+))(?!\])').sub(
+        r'[\1](https://github.com/%s/issues#issue/\2)' % proj_id, text)
+
+    # Restore hashed-out blocks.
+    for hash, s in s_from_hash.items():
+        text = text.replace(hash, s)
+
+    # Add summary.
+    if "summary" in meta:
+        text = ("# %s\n\n" % meta["summary"]) + text
+
+    base = splitext(basename(src_path))[0]
+    gh_page_name = _gh_page_name_from_gc_page_name(base)
+#    dst_path = join(dst_dir, gh_page_name+".md")
+    dst_path = join(dst_dir, gh_page_name+".creole")
+    if not exists(dst_path) or codecs.open(dst_path, 'r', 'utf-8').read() != text:
+        codecs.open(dst_path, 'w', 'utf-8').write(text)
+        log("wrote '%s'" % dst_path)
+
+
+#---- internal support stuff
+
+def _indent(text):
+    return '    ' + '\n    '.join(text.splitlines(False))
+
+def _gh_page_name_from_gc_page_name(gc):
+    """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
+    # FIXME: fails on all uppercase / all lowercase names (e.g. FAQ)
+    gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
+    return gh
+
+
+#---- mainline
+
+if __name__ == '__main__':
+    convert_dir("OpenRefine/OpenRefine", "c:/users/tfmorris/tmp/grefine-wiki", "c:/users/tfmorris/tmp/orefine-wiki")
+    if len(sys.argv) != 4:
+        print __doc__
+        sys.exit(1)
+    convert_dir(sys.argv[1], sys.argv[2], sys.argv[3])

From ea11c68f9cab719b6c4aa86b84e5fe6b23d2caaa Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Fri, 21 Dec 2012 18:46:45 -0500
Subject: [PATCH 03/16] Kill dead code and fix some bugs

Remove unused code
Fix tables, links, & bulleted lists
---
 wikiconvert_creole.py | 27 ++++++++-------------------
 1 file changed, 8 insertions(+), 19 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index fa5a6ae..f8e328b 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -66,23 +66,9 @@ def sub_pre_block(match):
      #   return sub_block(match,indent=False)
     text = re.compile(r'^`(.*?)^`', re.M|re.S).sub(r'##{{{\1}}}##', text) # monospace literal for Creole 
     
-    # Headings - No conversion needed for Creole. Markdown conversion below
-    #text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text)
-    #text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text)
-    #text = re.compile(r'^=(.*?)=\s*$', re.M).sub(lambda m: "# %s\n"%m.group(1).strip(), text)
+    # Headings - No conversion needed for Creole. 
 
     # Tables
-    def sub_table_html(m):
-        rows = []
-        for line in m.group(0).splitlines(False):
-            if not line.strip():
-                continue
-            rows.append(list(c.strip() for c in line.split("||")[1:-1]))
-        lines = ['<table>']
-        for row in rows:
-            lines.append('  <tr>%s</tr>' % ''.join('<td>%s</td>' % c for c in row))
-        lines.append('</table>')
-        return '\n\n' + '\n'.join(lines)
     def sub_table_creole(m):
         rows = []
         for line in m.group(0).splitlines(False):
@@ -92,16 +78,15 @@ def sub_table_creole(m):
         lines = []
         # Assume first row is a header (or should we assume the reverse?)
         if rows:
-            lines.append('|='.join(rows[0])[:-1] )# skip trailing equal sign
+            lines.append('|=''+|='.join(rows[0])+'|')
             for row in rows[1:]:
                 lines.append('|'.join(row))
             return '\n\n' + '\n'.join(lines)
-#    text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text)
     text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text)
 
     # Lists (doesn't handle nested lists).
     # TODO: leave bullet marker unchanged for *, -, +
-    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text)
+#    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text)
     text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
 
     # wiki links. - Creole & Markdown are the same - no change required to conversion
@@ -120,7 +105,7 @@ def sub_wikilink(m):
     # Links
     def sub_link(m):
         # s = "[%s](%s)" % (m.group(2), m.group(1)) # Markdown
-        s = "[[%s|%s]]" % (m.group(1), m.group(2)) # Creole
+        s = "[[%s|%s]]" % (m.group(2), m.group(1)) # Creole
         hash = md5(s.encode('utf8')).hexdigest()
         s_from_hash[hash] = s
         return hash
@@ -136,6 +121,10 @@ def sub_link(m):
     text = re.compile(r'(?<!\[)(issue (\d+))(?!\])').sub(
         r'[\1](https://github.com/%s/issues#issue/\2)' % proj_id, text)
 
+    # TODO - convert google groups references
+    # from http://groups.google.com/group/google-refine/
+    # to http://groups.google.com/group/openrefine
+
     # Restore hashed-out blocks.
     for hash, s in s_from_hash.items():
         text = text.replace(hash, s)

From aebe587b7634368eba64007ca50ec401f09f09b9 Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Fri, 21 Dec 2012 18:51:37 -0500
Subject: [PATCH 04/16] Fix typo in table conversion

---
 wikiconvert_creole.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index f8e328b..2377ce3 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -78,7 +78,7 @@ def sub_table_creole(m):
         lines = []
         # Assume first row is a header (or should we assume the reverse?)
         if rows:
-            lines.append('|=''+|='.join(rows[0])+'|')
+            lines.append('|='+'|='.join(rows[0])+'|')
             for row in rows[1:]:
                 lines.append('|'.join(row))
             return '\n\n' + '\n'.join(lines)

From ed358993b20b5905fb9717a5a7aad9395478a7ad Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Fri, 21 Dec 2012 23:42:46 -0500
Subject: [PATCH 05/16] Fix table bodies too

---
 wikiconvert_creole.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index 2377ce3..f43ff7d 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -80,7 +80,7 @@ def sub_table_creole(m):
         if rows:
             lines.append('|='+'|='.join(rows[0])+'|')
             for row in rows[1:]:
-                lines.append('|'.join(row))
+                lines.append('|'+'|'.join(row)+'|')
             return '\n\n' + '\n'.join(lines)
     text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text)
 

From 73217379bca9a5a82d74e4d6775a267ecb8949b5 Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Fri, 21 Dec 2012 23:59:10 -0500
Subject: [PATCH 06/16] Fix bulleted lists, add more TODOs

---
 wikiconvert_creole.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index f43ff7d..6e2c97b 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -86,7 +86,7 @@ def sub_table_creole(m):
 
     # Lists (doesn't handle nested lists).
     # TODO: leave bullet marker unchanged for *, -, +
-#    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text)
+    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r' \1', text)
     text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
 
     # wiki links. - Creole & Markdown are the same - no change required to conversion
@@ -125,10 +125,14 @@ def sub_link(m):
     # from http://groups.google.com/group/google-refine/
     # to http://groups.google.com/group/openrefine
 
-    # Restore hashed-out blocks.
+    # TODO: replace Google Refine with OpenRefine everywyere
+
+      # Restore hashed-out blocks.
     for hash, s in s_from_hash.items():
         text = text.replace(hash, s)
 
+    # TODO remove or replace #summary header pragmas
+  
     # Add summary.
     if "summary" in meta:
         text = ("# %s\n\n" % meta["summary"]) + text

From bef98984c96efbf8bc27c0501db1eb219d2517c6 Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Sat, 22 Dec 2012 00:31:00 -0500
Subject: [PATCH 07/16] Swap order of handling for lists and bolding

---
 wikiconvert_creole.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index 6e2c97b..1a36b16 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -84,9 +84,15 @@ def sub_table_creole(m):
             return '\n\n' + '\n'.join(lines)
     text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text)
 
+
+    # Italics, bold. - same for both Markdown & Creole
+    # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
+    text = re.compile(r'(?<![*\w])\*([^*]+?)\*(?![*\w])', re.S).sub(r'**\1**', text)
+    text = re.compile(r'(?<![_\w])_([^_]+?)_(?![_\w])', re.S).sub(r'*\1*', text)
+    
     # Lists (doesn't handle nested lists).
     # TODO: leave bullet marker unchanged for *, -, +
-    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r' \1', text)
+    text = re.compile(r'^  \*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text)
     text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
 
     # wiki links. - Creole & Markdown are the same - no change required to conversion
@@ -111,10 +117,6 @@ def sub_link(m):
         return hash
     text = re.compile(r'(?<!\[)\[([^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
 
-    # Italics, bold. - same for both Markdown & Creole
-    # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
-    text = re.compile(r'(?<![*\w])\*([^*]+?)\*(?![*\w])', re.S).sub(r'**\1**', text)
-    text = re.compile(r'(?<![_\w])_([^_]+?)_(?![_\w])', re.S).sub(r'*\1*', text)
 
     # Auto-linking "issue \d+"
     # TODO: Construct Google Code -> Github issue lookup map

From 51e1ee5c3db1b4da1f233dbc20ca4a8a67b20f17 Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Sat, 22 Dec 2012 10:40:08 -0500
Subject: [PATCH 08/16] Fix bulleted lists for real

Use a temporary marker so it doesn't conflict with bold processing
---
 wikiconvert_creole.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index 1a36b16..53a8b16 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -84,16 +84,17 @@ def sub_table_creole(m):
             return '\n\n' + '\n'.join(lines)
     text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text)
 
+    # Lists (doesn't handle nested lists - flattends structure).
+    text = re.compile(r'^[ \t]+\*[ \t]+(.*?)$', re.M).sub(r'{^} \1', text) # temp marker to avoid bold processing
+    text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
 
     # Italics, bold. - same for both Markdown & Creole
     # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
     text = re.compile(r'(?<![*\w])\*([^*]+?)\*(?![*\w])', re.S).sub(r'**\1**', text)
     text = re.compile(r'(?<![_\w])_([^_]+?)_(?![_\w])', re.S).sub(r'*\1*', text)
-    
-    # Lists (doesn't handle nested lists).
-    # TODO: leave bullet marker unchanged for *, -, +
-    text = re.compile(r'^  \*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text)
-    text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
+
+    # Swap our temporary bulllet marker back out
+    text = text.replace('{^}','*')
 
     # wiki links. - Creole & Markdown are the same - no change required to conversion
     def sub_wikilink(m):

From bb1a1932c87bb8dea53fe5e3e412b6a9643e9fcf Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Sat, 22 Dec 2012 11:47:57 -0500
Subject: [PATCH 09/16] More fixes

- Fix page name conversion to handle non-camel case
- Fix up summary formatting
- Change Google Refine to OpenRefine
- Fix mailing list pointers
---
 wikiconvert_creole.py | 35 +++++++++++++++++------------------
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index 53a8b16..f18e044 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -29,7 +29,7 @@ def convert_dir(proj_id, src_dir, dst_dir):
     else:
         for f in glob(join(src_dir, "*.wiki")):
             convert_file(proj_id, f, dst_dir)
-
+    
 def convert_file(proj_id, src_path, dst_dir):
     src = codecs.open(src_path, 'r', 'utf-8').read()
     meta_lines = []
@@ -56,8 +56,10 @@ def sub_block(match,indent=True):
         hash = md5(pre.encode('utf8')).hexdigest()
         s_from_hash[hash] = _indent(pre) if indent else pre
         return hash
+        
     def sub_pre_block(match):
         return sub_block(match,indent=True)
+
     # TODO not sure newline is correct after opening braces {{{code}}}
     text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text)
 
@@ -84,9 +86,9 @@ def sub_table_creole(m):
             return '\n\n' + '\n'.join(lines)
     text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text)
 
-    # Lists (doesn't handle nested lists - flattends structure).
+    # Lists (doesn't handle nested lists - flattens structure).
     text = re.compile(r'^[ \t]+\*[ \t]+(.*?)$', re.M).sub(r'{^} \1', text) # temp marker to avoid bold processing
-    text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text)
+    text = re.compile(r'^[ \t]+#[ \t]+(.*?)$', re.M).sub(r'1. \1', text)
 
     # Italics, bold. - same for both Markdown & Creole
     # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)
@@ -118,27 +120,23 @@ def sub_link(m):
         return hash
     text = re.compile(r'(?<!\[)\[([^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
 
-
     # Auto-linking "issue \d+"
     # TODO: Construct Google Code -> Github issue lookup map
     text = re.compile(r'(?<!\[)(issue (\d+))(?!\])').sub(
         r'[\1](https://github.com/%s/issues#issue/\2)' % proj_id, text)
 
-    # TODO - convert google groups references
-    # from http://groups.google.com/group/google-refine/
-    # to http://groups.google.com/group/openrefine
+    # Project specific replacements for naming, mailing lists, & issues
+    text = text.replace('Google Refine','OpenRefine')
+    text = text.replace('http://groups.google.com/group/google-refine',
+                 'http://groups.google.com/group/openrefine')
 
-    # TODO: replace Google Refine with OpenRefine everywyere
-
-      # Restore hashed-out blocks.
+    # Restore hashed-out blocks.
     for hash, s in s_from_hash.items():
         text = text.replace(hash, s)
-
-    # TODO remove or replace #summary header pragmas
-  
-    # Add summary.
+ 
+    #  Prepend summary.(not sure whether h3 or italics is best option here)
     if "summary" in meta:
-        text = ("# %s\n\n" % meta["summary"]) + text
+        text = ("//%s//\n\n" % meta["summary"]) + text
 
     base = splitext(basename(src_path))[0]
     gh_page_name = _gh_page_name_from_gc_page_name(base)
@@ -156,11 +154,12 @@ def _indent(text):
 
 def _gh_page_name_from_gc_page_name(gc):
     """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
-    # FIXME: fails on all uppercase / all lowercase names (e.g. FAQ)
-    gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
+    if re.match(r'[A-Z][a-z]{2,}',gc):
+        gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
+    else:
+        gh = gc
     return gh
 
-
 #---- mainline
 
 if __name__ == '__main__':

From e8d2d6418bd11535cb84ba49495f699c120ebefc Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Sat, 22 Dec 2012 13:18:36 -0500
Subject: [PATCH 10/16] Fix links & code blocks

- Allow inline code blocks (ie no newline required)
- make http link matching less aggressive (may still have problems)
- Move name substitution last so it fixes summaries, code examples, and
links
- Add debug code to warn when we failed to restore text blocks (still a
few issues left)
---
 wikiconvert_creole.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index f18e044..242a9fd 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -60,13 +60,12 @@ def sub_block(match,indent=True):
     def sub_pre_block(match):
         return sub_block(match,indent=True)
 
-    # TODO not sure newline is correct after opening braces {{{code}}}
-    text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text)
+    text = re.compile(r'^{{{(.*?)}}}', re.M|re.S).sub(sub_pre_block, text)
 
     #  Pull out `backtick` code quotes 
     #def sub_code(match)
      #   return sub_block(match,indent=False)
-    text = re.compile(r'^`(.*?)^`', re.M|re.S).sub(r'##{{{\1}}}##', text) # monospace literal for Creole 
+    text = re.compile(r'`(.*?)`', re.M|re.S).sub(r'##{{{\1}}}##', text) # monospace literal for Creole 
     
     # Headings - No conversion needed for Creole. 
 
@@ -118,26 +117,31 @@ def sub_link(m):
         hash = md5(s.encode('utf8')).hexdigest()
         s_from_hash[hash] = s
         return hash
-    text = re.compile(r'(?<!\[)\[([^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
+    # NOTE: this only matches http & ftp links currently
+    text = re.compile(r'(?<!\[)\[((http|ftp):[^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
 
     # Auto-linking "issue \d+"
     # TODO: Construct Google Code -> Github issue lookup map
     text = re.compile(r'(?<!\[)(issue (\d+))(?!\])').sub(
         r'[\1](https://github.com/%s/issues#issue/\2)' % proj_id, text)
 
-    # Project specific replacements for naming, mailing lists, & issues
-    text = text.replace('Google Refine','OpenRefine')
-    text = text.replace('http://groups.google.com/group/google-refine',
-                 'http://groups.google.com/group/openrefine')
-
     # Restore hashed-out blocks.
     for hash, s in s_from_hash.items():
+        if text == text.replace(hash,s):
+            print 'Contained = ',text.find(hash)
+            print 'Failed to replace %s with %s' % (hash,s)
         text = text.replace(hash, s)
  
     #  Prepend summary.(not sure whether h3 or italics is best option here)
     if "summary" in meta:
         text = ("//%s//\n\n" % meta["summary"]) + text
 
+    # Project specific replacements for naming, mailing lists, & issues
+    text = text.replace('Google Refine','OpenRefine')
+    text = text.replace('http://groups.google.com/group/google-refine',
+                 'http://groups.google.com/group/openrefine')
+
+
     base = splitext(basename(src_path))[0]
     gh_page_name = _gh_page_name_from_gc_page_name(base)
 #    dst_path = join(dst_dir, gh_page_name+".md")

From dccf041c5fccb3508b60eb1a580797a0c4ae93bd Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Sat, 22 Dec 2012 13:53:49 -0500
Subject: [PATCH 11/16] Fix http & repo links

- Fix label on http links
- Map Google Code repo links to their Github equivalents
---
 wikiconvert_creole.py | 13 ++++++++-----
 1 file changed, 8 insertions(+), 5 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index 242a9fd..e707588 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -118,7 +118,7 @@ def sub_link(m):
         s_from_hash[hash] = s
         return hash
     # NOTE: this only matches http & ftp links currently
-    text = re.compile(r'(?<!\[)\[((http|ftp):[^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
+    text = re.compile(r'(?<!\[)\[((?:http|ftp):[^\s]+)\s+(.*?)\](?!\])', re.S).sub(sub_link, text)
 
     # Auto-linking "issue \d+"
     # TODO: Construct Google Code -> Github issue lookup map
@@ -136,11 +136,14 @@ def sub_link(m):
     if "summary" in meta:
         text = ("//%s//\n\n" % meta["summary"]) + text
 
-    # Project specific replacements for naming, mailing lists, & issues
+    # Project specific replacements for naming, mailing lists, & code
     text = text.replace('Google Refine','OpenRefine')
-    text = text.replace('http://groups.google.com/group/google-refine',
-                 'http://groups.google.com/group/openrefine')
-
+    text = text.replace(
+                    'http://groups.google.com/group/google-refine',
+                     'http://groups.google.com/group/openrefine')
+    text = text.replace(
+                        'code.google.com/p/google-refine/source/browse/trunk/',
+                        'github.com/OpenRefine/OpenRefine/blob/master/')
 
     base = splitext(basename(src_path))[0]
     gh_page_name = _gh_page_name_from_gc_page_name(base)

From 3d8ceb5873f54ec361b03f38b743cd9bba61379c Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Sat, 22 Dec 2012 14:15:53 -0500
Subject: [PATCH 12/16] Misc link fixes

- exclude protocol from GC to Github link conversion so https works
- remove ## surrounding inline code
- fix intrawiki links
---
 wikiconvert_creole.py | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index e707588..83efe13 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -65,7 +65,7 @@ def sub_pre_block(match):
     #  Pull out `backtick` code quotes 
     #def sub_code(match)
      #   return sub_block(match,indent=False)
-    text = re.compile(r'`(.*?)`', re.M|re.S).sub(r'##{{{\1}}}##', text) # monospace literal for Creole 
+    text = re.compile(r'`(.*?)`', re.M|re.S).sub(r'{{{\1}}}', text) # monospace literal for Creole 
     
     # Headings - No conversion needed for Creole. 
 
@@ -97,12 +97,11 @@ def sub_table_creole(m):
     # Swap our temporary bulllet marker back out
     text = text.replace('{^}','*')
 
-    # wiki links. - Creole & Markdown are the same - no change required to conversion
+    # wiki links.
     def sub_wikilink(m):
         gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ')
         if m.group(2):
-            s = "[[%s|%s]]" % (gh_page_name, m.group(2))
-            pass
+            s = "[[%s|%s]]" % ( m.group(2),gh_page_name)
         else:
             s = "[[%s]]" % gh_page_name
         hash = md5(s.encode('utf8')).hexdigest()
@@ -139,8 +138,8 @@ def sub_link(m):
     # Project specific replacements for naming, mailing lists, & code
     text = text.replace('Google Refine','OpenRefine')
     text = text.replace(
-                    'http://groups.google.com/group/google-refine',
-                     'http://groups.google.com/group/openrefine')
+                    '://groups.google.com/group/google-refine',
+                     '://groups.google.com/group/openrefine')
     text = text.replace(
                         'code.google.com/p/google-refine/source/browse/trunk/',
                         'github.com/OpenRefine/OpenRefine/blob/master/')

From 17d8d5668f83aae8036cf5e4aadcdd8c5631f88f Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Sat, 22 Dec 2012 14:59:39 -0500
Subject: [PATCH 13/16] Fix code blocks & add warning

- Fix code block syntax (wasn't using Creole)
- Prepend warning to top of page to prompt human review
---
 wikiconvert_creole.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index 83efe13..e518ffc 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -54,7 +54,8 @@ def convert_file(proj_id, src_path, dst_dir):
     def sub_block(match,indent=True):
         pre = match.group(1)
         hash = md5(pre.encode('utf8')).hexdigest()
-        s_from_hash[hash] = _indent(pre) if indent else pre
+        # Creole uses braces, not indentation for code blocks
+        s_from_hash[hash] = "{{{"+pre+"}}}" if indent else pre
         return hash
         
     def sub_pre_block(match):
@@ -127,11 +128,20 @@ def sub_link(m):
     # Restore hashed-out blocks.
     for hash, s in s_from_hash.items():
         if text == text.replace(hash,s):
-            print 'Contained = ',text.find(hash)
             print 'Failed to replace %s with %s' % (hash,s)
         text = text.replace(hash, s)
+
+    base = splitext(basename(src_path))[0]
+
+    # Prepend warning block for manual review
+    text = '----\n**NOTE**: This page was automatically converted from the ' \
+           + ('[[old page|http://code.google.com/p/google-refine/wiki/%s]] ' % base) \
+           + ' at Google Code and has not been manually reviewed.  Please compare it to  ' \
+           + ('[[the original|http://code.google.com/p/google-refine/wiki/%s]] ' % base) \
+           + ', correct any errors or omissions, then remove this warning block.\n----\n\n\n' \
+           +text
  
-    #  Prepend summary.(not sure whether h3 or italics is best option here)
+    #  Prepend summary
     if "summary" in meta:
         text = ("//%s//\n\n" % meta["summary"]) + text
 
@@ -144,9 +154,7 @@ def sub_link(m):
                         'code.google.com/p/google-refine/source/browse/trunk/',
                         'github.com/OpenRefine/OpenRefine/blob/master/')
 
-    base = splitext(basename(src_path))[0]
     gh_page_name = _gh_page_name_from_gc_page_name(base)
-#    dst_path = join(dst_dir, gh_page_name+".md")
     dst_path = join(dst_dir, gh_page_name+".creole")
     if not exists(dst_path) or codecs.open(dst_path, 'r', 'utf-8').read() != text:
         codecs.open(dst_path, 'w', 'utf-8').write(text)

From 89be3ab72bd93bf0f1de59f6fa0f8895a1b2e146 Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Sun, 23 Dec 2012 13:09:39 -0500
Subject: [PATCH 14/16] Fix numbered lists using correct Creole syntax

---
 wikiconvert_creole.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index e518ffc..db9905e 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -88,7 +88,7 @@ def sub_table_creole(m):
 
     # Lists (doesn't handle nested lists - flattens structure).
     text = re.compile(r'^[ \t]+\*[ \t]+(.*?)$', re.M).sub(r'{^} \1', text) # temp marker to avoid bold processing
-    text = re.compile(r'^[ \t]+#[ \t]+(.*?)$', re.M).sub(r'1. \1', text)
+    text = re.compile(r'^[ \t]+#[ \t]+(.*?)$', re.M).sub(r'# \1', text)
 
     # Italics, bold. - same for both Markdown & Creole
     # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w)

From fd0e858d388a62ba6ae70a46ffe46cb66cfe19ec Mon Sep 17 00:00:00 2001
From: Tom Morris <tfmorris@gmail.com>
Date: Wed, 26 Dec 2012 11:34:57 -0500
Subject: [PATCH 15/16] Improve page name conversion

---
 wikiconvert_creole.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py
index db9905e..d30fba6 100644
--- a/wikiconvert_creole.py
+++ b/wikiconvert_creole.py
@@ -168,8 +168,9 @@ def _indent(text):
 
 def _gh_page_name_from_gc_page_name(gc):
     """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
-    if re.match(r'[A-Z][a-z]{2,}',gc):
-        gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
+    # Note: We can't handle both FetchingURLsFromWebServices & GRELFunctions
+    if re.match(r'[A-Z]+[a-z]{2,}',gc):
+        gh = re.sub(r'([A-Z]+[a-z]+)', r'-\1', gc)[1:]
     else:
         gh = gc
     return gh

From b9b1a220ad4ff17700b9839a62e5c79a615752e7 Mon Sep 17 00:00:00 2001
From: Ilya Lipnitskiy <ilya.lipnitskiy@gmail.com>
Date: Sat, 9 Nov 2013 01:11:22 -0800
Subject: [PATCH 16/16] Fix a few parsing bugs

---
 wikiconvert.py | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/wikiconvert.py b/wikiconvert.py
index 683a625..97203b1 100644
--- a/wikiconvert.py
+++ b/wikiconvert.py
@@ -38,10 +38,8 @@ def convert_file(proj_id, src_path, dst_dir):
     for i, line in enumerate(lines):
         if line.startswith("#"):
             meta_lines.append(line)
-        else:
-            assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line)
-            # TODO is it actually mandatory that a blank line separate meta text from body text?
-            body_lines = lines[i+1:]
+	elif line.strip():
+            body_lines = lines[i:]
             break
     meta = {}
     for line in meta_lines:
@@ -58,6 +56,7 @@ def convert_file(proj_id, src_path, dst_dir):
     text = re.compile(r'^}}}+ *(\n|$)', re.M).sub(r"```\n", text)
 
     # TODO: Add support for `backtick` code quotes
+    text = re.sub(r'{{{(.*?)}}}', r'`\1`', text)
     
     # Headings.
     text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text)
@@ -131,12 +130,11 @@ def sub_link(m):
 #---- internal support stuff
 
 def _indent(text):
-    return '    ' + '\n    '.join(text.splitlines(False))
+    return '\n    ' + '\n    '.join(text.splitlines(False))
 
 def _gh_page_name_from_gc_page_name(gc):
     """Github (gh) Wiki page name from Google Code (gc) Wiki page name."""
-    # FIXME: fails on all uppercase / all lowercase names (e.g. FAQ)
-    gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:]
+    gh = re.sub(r'([A-Za-z]+)_?', r'-\1', gc)[1:]
     return gh