From 621c65caf24ad55da6d741d4241e283946f8f195 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Fri, 21 Dec 2012 18:15:01 -0500 Subject: [PATCH 01/16] Improve error message and add TODOs --- wikiconvert.py | 10 ++- wikiconvert_creole.py | 172 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 178 insertions(+), 4 deletions(-) create mode 100644 wikiconvert_creole.py diff --git a/wikiconvert.py b/wikiconvert.py index d09705d..683a625 100644 --- a/wikiconvert.py +++ b/wikiconvert.py @@ -39,10 +39,9 @@ def convert_file(proj_id, src_path, dst_dir): if line.startswith("#"): meta_lines.append(line) else: - if not line.strip(): - body_lines = lines[i+1:] - else: - body_lines = lines[i:] + assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line) + # TODO is it actually mandatory that a blank line separate meta text from body text? + body_lines = lines[i+1:] break meta = {} for line in meta_lines: @@ -58,6 +57,8 @@ def convert_file(proj_id, src_path, dst_dir): text = re.compile(r'^{{{+ *\n', re.M).sub(r"```\n", text) text = re.compile(r'^}}}+ *(\n|$)', re.M).sub(r"```\n", text) + # TODO: Add support for `backtick` code quotes + # Headings. text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text) text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text) @@ -134,6 +135,7 @@ def _indent(text): def _gh_page_name_from_gc_page_name(gc): """Github (gh) Wiki page name from Google Code (gc) Wiki page name.""" + # FIXME: fails on all uppercase / all lowercase names (e.g. FAQ) gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:] return gh diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py new file mode 100644 index 0000000..8594a36 --- /dev/null +++ b/wikiconvert_creole.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python + +""" +Usage: + python googlecode2github/wikiconfig.py PROJID SRCDIR DSTDIR + +where "PROJID" is the github project id, e.g. "trentm/python-markdown2", +"SRCDIR" is a Google Code project wiki Subversion working copy dir and +"DSTDIR" is the git clone dir of the git project's wiki. +""" + +__version__ = "1.0.0" + +import re +import sys +from os.path import * +from glob import glob +from pprint import pprint +import codecs +from hashlib import md5 + + +def log(s): + sys.stderr.write(s+"\n") + +def convert_dir(proj_id, src_dir, dst_dir): + if isfile(src_dir): + convert_file(proj_id, src_dir, dst_dir) + else: + for f in glob(join(src_dir, "*.wiki")): + convert_file(proj_id, f, dst_dir) + +def convert_file(proj_id, src_path, dst_dir): + src = codecs.open(src_path, 'r', 'utf-8').read() + meta_lines = [] + body_lines = [] + lines = src.splitlines(False) + for i, line in enumerate(lines): + if line.startswith("#"): + meta_lines.append(line) + else: + assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line) + # TODO is it actually mandtory that a blank line separate meta text from body text? + body_lines = lines[i+1:] + break + meta = {} + for line in meta_lines: + k,v = line[1:].split(None, 1) + meta[k] = v + text = '\n'.join(body_lines) + s_from_hash = {} + + # Pull out pre-blocks so we can restore them unmunged + def sub_block(match,indent=True): + pre = match.group(1) + hash = md5(pre.encode('utf8')).hexdigest() + s_from_hash[hash] = _indent(pre) if indent else pre + return hash + def sub_pre_block(match): + return sub_block(match,indent=True) + # TODO not sure newline is correct after opening braces {{{code}}} + text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text) + + # Pull out `backtick` code quotes + #def sub_code(match) + # return sub_block(match,indent=False) + text = re.compile(r'^`(.*?)^`', re.M|re.S).sub(r'##{{{\1}}}##', text), text) # monospace literal for Creole + + # Headings - No conversion needed for Creole. Markdown conversion below + #text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text) + #text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text) + #text = re.compile(r'^=(.*?)=\s*$', re.M).sub(lambda m: "# %s\n"%m.group(1).strip(), text) + + # Tables + def sub_table_html(m): + rows = [] + for line in m.group(0).splitlines(False): + if not line.strip(): + continue + rows.append(list(c.strip() for c in line.split("||")[1:-1])) + lines = [''] + for row in rows: + lines.append(' %s' % ''.join('' % c for c in row)) + lines.append('
%s
') + return '\n\n' + '\n'.join(lines) + def sub_table_creole(m): + rows = [] + for line in m.group(0).splitlines(False): + if not line.strip(): + continue + rows.append(list(c.strip() for c in line.split("||")[1:-1])) + lines = [] + # Assume first row is a header (or should we assume the reverse?) + lines.append('|='.join(row[0]))[:-1] # skip trailing equal sign + for row in rows[1:]: + lines.append('|'.join(row)) + return '\n\n' + '\n'.join(lines) + text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text) + + # Lists (doesn't handle nested lists). + # TODO: leave bullet marker unchanged for *, -, + + text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text) + text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text) + + # wiki links. - Creole & Markdown are the same - no change required to conversion + def sub_wikilink(m): + gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ') + if m.group(2): + s = "[[%s|%s]]" % (gh_page_name, m.group(2)) + pass + else: + s = "[[%s]]" % gh_page_name + hash = md5(s.encode('utf8')).hexdigest() + s_from_hash[hash] = s + return hash + text = re.compile(r'\[((?:[A-Z][a-z]+)+)(?:\s+(.*?))?\]', re.S).sub(sub_wikilink, text) + + # Links + def sub_link(m): + # s = "[%s](%s)" % (m.group(2), m.group(1)) # Markdown + s = "[[%s|%s]]" % (m.group(1), m.group(2)) # Creole + hash = md5(s.encode('utf8')).hexdigest() + s_from_hash[hash] = s + return hash + text = re.compile(r'(? Github issue lookup map + text = re.compile(r'(? Date: Fri, 21 Dec 2012 18:19:02 -0500 Subject: [PATCH 02/16] First pass As used for first pass of automatic conversion. Still needs work --- wikiconvert_creole.py | 347 +++++++++++++++++++++--------------------- 1 file changed, 175 insertions(+), 172 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index 8594a36..fa5a6ae 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -1,172 +1,175 @@ -#!/usr/bin/env python - -""" -Usage: - python googlecode2github/wikiconfig.py PROJID SRCDIR DSTDIR - -where "PROJID" is the github project id, e.g. "trentm/python-markdown2", -"SRCDIR" is a Google Code project wiki Subversion working copy dir and -"DSTDIR" is the git clone dir of the git project's wiki. -""" - -__version__ = "1.0.0" - -import re -import sys -from os.path import * -from glob import glob -from pprint import pprint -import codecs -from hashlib import md5 - - -def log(s): - sys.stderr.write(s+"\n") - -def convert_dir(proj_id, src_dir, dst_dir): - if isfile(src_dir): - convert_file(proj_id, src_dir, dst_dir) - else: - for f in glob(join(src_dir, "*.wiki")): - convert_file(proj_id, f, dst_dir) - -def convert_file(proj_id, src_path, dst_dir): - src = codecs.open(src_path, 'r', 'utf-8').read() - meta_lines = [] - body_lines = [] - lines = src.splitlines(False) - for i, line in enumerate(lines): - if line.startswith("#"): - meta_lines.append(line) - else: - assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line) - # TODO is it actually mandtory that a blank line separate meta text from body text? - body_lines = lines[i+1:] - break - meta = {} - for line in meta_lines: - k,v = line[1:].split(None, 1) - meta[k] = v - text = '\n'.join(body_lines) - s_from_hash = {} - - # Pull out pre-blocks so we can restore them unmunged - def sub_block(match,indent=True): - pre = match.group(1) - hash = md5(pre.encode('utf8')).hexdigest() - s_from_hash[hash] = _indent(pre) if indent else pre - return hash - def sub_pre_block(match): - return sub_block(match,indent=True) - # TODO not sure newline is correct after opening braces {{{code}}} - text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text) - - # Pull out `backtick` code quotes - #def sub_code(match) - # return sub_block(match,indent=False) - text = re.compile(r'^`(.*?)^`', re.M|re.S).sub(r'##{{{\1}}}##', text), text) # monospace literal for Creole - - # Headings - No conversion needed for Creole. Markdown conversion below - #text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text) - #text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text) - #text = re.compile(r'^=(.*?)=\s*$', re.M).sub(lambda m: "# %s\n"%m.group(1).strip(), text) - - # Tables - def sub_table_html(m): - rows = [] - for line in m.group(0).splitlines(False): - if not line.strip(): - continue - rows.append(list(c.strip() for c in line.split("||")[1:-1])) - lines = [''] - for row in rows: - lines.append(' %s' % ''.join('' % c for c in row)) - lines.append('
%s
') - return '\n\n' + '\n'.join(lines) - def sub_table_creole(m): - rows = [] - for line in m.group(0).splitlines(False): - if not line.strip(): - continue - rows.append(list(c.strip() for c in line.split("||")[1:-1])) - lines = [] - # Assume first row is a header (or should we assume the reverse?) - lines.append('|='.join(row[0]))[:-1] # skip trailing equal sign - for row in rows[1:]: - lines.append('|'.join(row)) - return '\n\n' + '\n'.join(lines) - text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text) - - # Lists (doesn't handle nested lists). - # TODO: leave bullet marker unchanged for *, -, + - text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text) - text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text) - - # wiki links. - Creole & Markdown are the same - no change required to conversion - def sub_wikilink(m): - gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ') - if m.group(2): - s = "[[%s|%s]]" % (gh_page_name, m.group(2)) - pass - else: - s = "[[%s]]" % gh_page_name - hash = md5(s.encode('utf8')).hexdigest() - s_from_hash[hash] = s - return hash - text = re.compile(r'\[((?:[A-Z][a-z]+)+)(?:\s+(.*?))?\]', re.S).sub(sub_wikilink, text) - - # Links - def sub_link(m): - # s = "[%s](%s)" % (m.group(2), m.group(1)) # Markdown - s = "[[%s|%s]]" % (m.group(1), m.group(2)) # Creole - hash = md5(s.encode('utf8')).hexdigest() - s_from_hash[hash] = s - return hash - text = re.compile(r'(? Github issue lookup map - text = re.compile(r'(?'] + for row in rows: + lines.append(' %s' % ''.join('%s' % c for c in row)) + lines.append('') + return '\n\n' + '\n'.join(lines) + def sub_table_creole(m): + rows = [] + for line in m.group(0).splitlines(False): + if not line.strip(): + continue + rows.append(list(c.strip() for c in line.split("||")[1:-1])) + lines = [] + # Assume first row is a header (or should we assume the reverse?) + if rows: + lines.append('|='.join(rows[0])[:-1] )# skip trailing equal sign + for row in rows[1:]: + lines.append('|'.join(row)) + return '\n\n' + '\n'.join(lines) +# text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text) + text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text) + + # Lists (doesn't handle nested lists). + # TODO: leave bullet marker unchanged for *, -, + + text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text) + text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text) + + # wiki links. - Creole & Markdown are the same - no change required to conversion + def sub_wikilink(m): + gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ') + if m.group(2): + s = "[[%s|%s]]" % (gh_page_name, m.group(2)) + pass + else: + s = "[[%s]]" % gh_page_name + hash = md5(s.encode('utf8')).hexdigest() + s_from_hash[hash] = s + return hash + text = re.compile(r'\[((?:[A-Z][a-z]+)+)(?:\s+(.*?))?\]', re.S).sub(sub_wikilink, text) + + # Links + def sub_link(m): + # s = "[%s](%s)" % (m.group(2), m.group(1)) # Markdown + s = "[[%s|%s]]" % (m.group(1), m.group(2)) # Creole + hash = md5(s.encode('utf8')).hexdigest() + s_from_hash[hash] = s + return hash + text = re.compile(r'(? Github issue lookup map + text = re.compile(r'(? Date: Fri, 21 Dec 2012 18:46:45 -0500 Subject: [PATCH 03/16] Kill dead code and fix some bugs Remove unused code Fix tables, links, & bulletted lists --- wikiconvert_creole.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index fa5a6ae..f8e328b 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -66,23 +66,9 @@ def sub_pre_block(match): # return sub_block(match,indent=False) text = re.compile(r'^`(.*?)^`', re.M|re.S).sub(r'##{{{\1}}}##', text) # monospace literal for Creole - # Headings - No conversion needed for Creole. Markdown conversion below - #text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text) - #text = re.compile(r'^==(.*?)==\s*$', re.M).sub(lambda m: "## %s\n"%m.group(1).strip(), text) - #text = re.compile(r'^=(.*?)=\s*$', re.M).sub(lambda m: "# %s\n"%m.group(1).strip(), text) + # Headings - No conversion needed for Creole. # Tables - def sub_table_html(m): - rows = [] - for line in m.group(0).splitlines(False): - if not line.strip(): - continue - rows.append(list(c.strip() for c in line.split("||")[1:-1])) - lines = [''] - for row in rows: - lines.append(' %s' % ''.join('' % c for c in row)) - lines.append('
%s
') - return '\n\n' + '\n'.join(lines) def sub_table_creole(m): rows = [] for line in m.group(0).splitlines(False): @@ -92,16 +78,15 @@ def sub_table_creole(m): lines = [] # Assume first row is a header (or should we assume the reverse?) if rows: - lines.append('|='.join(rows[0])[:-1] )# skip trailing equal sign + lines.append('|=''+|='.join(rows[0])+'|') for row in rows[1:]: lines.append('|'.join(row)) return '\n\n' + '\n'.join(lines) -# text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table, text) text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text) # Lists (doesn't handle nested lists). # TODO: leave bullet marker unchanged for *, -, + - text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text) +# text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text) text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text) # wiki links. - Creole & Markdown are the same - no change required to conversion @@ -120,7 +105,7 @@ def sub_wikilink(m): # Links def sub_link(m): # s = "[%s](%s)" % (m.group(2), m.group(1)) # Markdown - s = "[[%s|%s]]" % (m.group(1), m.group(2)) # Creole + s = "[[%s|%s]]" % (m.group(2), m.group(1)) # Creole hash = md5(s.encode('utf8')).hexdigest() s_from_hash[hash] = s return hash @@ -136,6 +121,10 @@ def sub_link(m): text = re.compile(r'(? Date: Fri, 21 Dec 2012 18:51:37 -0500 Subject: [PATCH 04/16] Fix typo in table conversion --- wikiconvert_creole.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index f8e328b..2377ce3 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -78,7 +78,7 @@ def sub_table_creole(m): lines = [] # Assume first row is a header (or should we assume the reverse?) if rows: - lines.append('|=''+|='.join(rows[0])+'|') + lines.append('|='+'|='.join(rows[0])+'|') for row in rows[1:]: lines.append('|'.join(row)) return '\n\n' + '\n'.join(lines) From ed358993b20b5905fb9717a5a7aad9395478a7ad Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Fri, 21 Dec 2012 23:42:46 -0500 Subject: [PATCH 05/16] Fix table bodies too --- wikiconvert_creole.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index 2377ce3..f43ff7d 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -80,7 +80,7 @@ def sub_table_creole(m): if rows: lines.append('|='+'|='.join(rows[0])+'|') for row in rows[1:]: - lines.append('|'.join(row)) + lines.append('|'+'|'.join(row)+'|') return '\n\n' + '\n'.join(lines) text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text) From 73217379bca9a5a82d74e4d6775a267ecb8949b5 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Fri, 21 Dec 2012 23:59:10 -0500 Subject: [PATCH 06/16] Fix bulletted lists, add more TODOs --- wikiconvert_creole.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index f43ff7d..6e2c97b 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -86,7 +86,7 @@ def sub_table_creole(m): # Lists (doesn't handle nested lists). # TODO: leave bullet marker unchanged for *, -, + -# text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r'* \1', text) + text = re.compile(r'^[ \t]+\*[ \t]+(.*?)[ \t]*$', re.M).sub(r' \1', text) text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text) # wiki links. - Creole & Markdown are the same - no change required to conversion @@ -125,10 +125,14 @@ def sub_link(m): # from http://groups.google.com/group/google-refine/ # to http://groups.google.com/group/openrefine - # Restore hashed-out blocks. + # TODO: replace Google Refine with OpenRefine everywyere + + # Restore hashed-out blocks. for hash, s in s_from_hash.items(): text = text.replace(hash, s) + # TODO remove or replace #summary header pragmas + # Add summary. if "summary" in meta: text = ("# %s\n\n" % meta["summary"]) + text From bef98984c96efbf8bc27c0501db1eb219d2517c6 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Sat, 22 Dec 2012 00:31:00 -0500 Subject: [PATCH 07/16] Swap order of handling for lists and bolding --- wikiconvert_creole.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index 6e2c97b..1a36b16 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -84,9 +84,15 @@ def sub_table_creole(m): return '\n\n' + '\n'.join(lines) text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text) + + # Italics, bold. - same for both Markdown & Creole + # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w) + text = re.compile(r'(? Github issue lookup map From 51e1ee5c3db1b4da1f233dbc20ca4a8a67b20f17 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Sat, 22 Dec 2012 10:40:08 -0500 Subject: [PATCH 08/16] Fix bulletted lists for real Use a temporary marker so it doesn't conflict with bold processing --- wikiconvert_creole.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index 1a36b16..53a8b16 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -84,16 +84,17 @@ def sub_table_creole(m): return '\n\n' + '\n'.join(lines) text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text) + # Lists (doesn't handle nested lists - flattends structure). + text = re.compile(r'^[ \t]+\*[ \t]+(.*?)$', re.M).sub(r'{^} \1', text) # temp marker to avoid bold processing + text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text) # Italics, bold. - same for both Markdown & Creole # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w) text = re.compile(r'(? Date: Sat, 22 Dec 2012 11:47:57 -0500 Subject: [PATCH 09/16] More fixes - Fix page name conversion to handle non-camel case - Fix up summary formatting - Change Google Refine to OpenRefine - Fix mailing list pointers --- wikiconvert_creole.py | 35 +++++++++++++++++------------------ 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index 53a8b16..f18e044 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -29,7 +29,7 @@ def convert_dir(proj_id, src_dir, dst_dir): else: for f in glob(join(src_dir, "*.wiki")): convert_file(proj_id, f, dst_dir) - + def convert_file(proj_id, src_path, dst_dir): src = codecs.open(src_path, 'r', 'utf-8').read() meta_lines = [] @@ -56,8 +56,10 @@ def sub_block(match,indent=True): hash = md5(pre.encode('utf8')).hexdigest() s_from_hash[hash] = _indent(pre) if indent else pre return hash + def sub_pre_block(match): return sub_block(match,indent=True) + # TODO not sure newline is correct after opening braces {{{code}}} text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text) @@ -84,9 +86,9 @@ def sub_table_creole(m): return '\n\n' + '\n'.join(lines) text = re.compile(r'\n(\n^\|\|(.*?\|\|)+$)+', re.M).sub(sub_table_creole, text) - # Lists (doesn't handle nested lists - flattends structure). + # Lists (doesn't handle nested lists - flattens structure). text = re.compile(r'^[ \t]+\*[ \t]+(.*?)$', re.M).sub(r'{^} \1', text) # temp marker to avoid bold processing - text = re.compile(r'^[ \t]+#[ \t]+(.*?)[ \t]*$', re.M).sub(r'1. \1', text) + text = re.compile(r'^[ \t]+#[ \t]+(.*?)$', re.M).sub(r'1. \1', text) # Italics, bold. - same for both Markdown & Creole # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w) @@ -118,27 +120,23 @@ def sub_link(m): return hash text = re.compile(r'(? Github issue lookup map text = re.compile(r'(? Date: Sat, 22 Dec 2012 13:18:36 -0500 Subject: [PATCH 10/16] Fix links & code blocks - Allow inline code blocks (ie no newline required) - make http link matching less aggressive (may still have problems) - Move name substituion last so it fixes summaries, code examples, and links - Add debug code to warn when we failed to restore text blocks (still a few issues left) --- wikiconvert_creole.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index f18e044..242a9fd 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -60,13 +60,12 @@ def sub_block(match,indent=True): def sub_pre_block(match): return sub_block(match,indent=True) - # TODO not sure newline is correct after opening braces {{{code}}} - text = re.compile(r'^{{{\n(.*?)^}}}', re.M|re.S).sub(sub_pre_block, text) + text = re.compile(r'^{{{(.*?)}}}', re.M|re.S).sub(sub_pre_block, text) # Pull out `backtick` code quotes #def sub_code(match) # return sub_block(match,indent=False) - text = re.compile(r'^`(.*?)^`', re.M|re.S).sub(r'##{{{\1}}}##', text) # monospace literal for Creole + text = re.compile(r'`(.*?)`', re.M|re.S).sub(r'##{{{\1}}}##', text) # monospace literal for Creole # Headings - No conversion needed for Creole. @@ -118,26 +117,31 @@ def sub_link(m): hash = md5(s.encode('utf8')).hexdigest() s_from_hash[hash] = s return hash - text = re.compile(r'(? Github issue lookup map text = re.compile(r'(? Date: Sat, 22 Dec 2012 13:53:49 -0500 Subject: [PATCH 11/16] Fix http & repo links - Fix label on http links - Map Google Code repo links to their Github equivalents --- wikiconvert_creole.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index 242a9fd..e707588 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -118,7 +118,7 @@ def sub_link(m): s_from_hash[hash] = s return hash # NOTE: this only matches http & ftp links currently - text = re.compile(r'(? Github issue lookup map @@ -136,11 +136,14 @@ def sub_link(m): if "summary" in meta: text = ("//%s//\n\n" % meta["summary"]) + text - # Project specific replacements for naming, mailing lists, & issues + # Project specific replacements for naming, mailing lists, & code text = text.replace('Google Refine','OpenRefine') - text = text.replace('http://groups.google.com/group/google-refine', - 'http://groups.google.com/group/openrefine') - + text = text.replace( + 'http://groups.google.com/group/google-refine', + 'http://groups.google.com/group/openrefine') + text = text.replace( + 'code.google.com/p/google-refine/source/browse/trunk/', + 'github.com/OpenRefine/OpenRefine/blob/master/') base = splitext(basename(src_path))[0] gh_page_name = _gh_page_name_from_gc_page_name(base) From 3d8ceb5873f54ec361b03f38b743cd9bba61379c Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Sat, 22 Dec 2012 14:15:53 -0500 Subject: [PATCH 12/16] Misc link fixes - exclude protocol from GC to Github link conversion so https works - remove ## surrounding inline code - fix intrawiki links --- wikiconvert_creole.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index e707588..83efe13 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -65,7 +65,7 @@ def sub_pre_block(match): # Pull out `backtick` code quotes #def sub_code(match) # return sub_block(match,indent=False) - text = re.compile(r'`(.*?)`', re.M|re.S).sub(r'##{{{\1}}}##', text) # monospace literal for Creole + text = re.compile(r'`(.*?)`', re.M|re.S).sub(r'{{{\1}}}', text) # monospace literal for Creole # Headings - No conversion needed for Creole. @@ -97,12 +97,11 @@ def sub_table_creole(m): # Swap our temporary bulllet marker back out text = text.replace('{^}','*') - # wiki links. - Creole & Markdown are the same - no change required to conversion + # wiki links. def sub_wikilink(m): gh_page_name = _gh_page_name_from_gc_page_name(m.group(1)).replace('-', ' ') if m.group(2): - s = "[[%s|%s]]" % (gh_page_name, m.group(2)) - pass + s = "[[%s|%s]]" % ( m.group(2),gh_page_name) else: s = "[[%s]]" % gh_page_name hash = md5(s.encode('utf8')).hexdigest() @@ -139,8 +138,8 @@ def sub_link(m): # Project specific replacements for naming, mailing lists, & code text = text.replace('Google Refine','OpenRefine') text = text.replace( - 'http://groups.google.com/group/google-refine', - 'http://groups.google.com/group/openrefine') + '://groups.google.com/group/google-refine', + '://groups.google.com/group/openrefine') text = text.replace( 'code.google.com/p/google-refine/source/browse/trunk/', 'github.com/OpenRefine/OpenRefine/blob/master/') From 17d8d5668f83aae8036cf5e4aadcdd8c5631f88f Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Sat, 22 Dec 2012 14:59:39 -0500 Subject: [PATCH 13/16] Fix code blocks & add warning - Fix code block syntax (wasn't using Creole) - Prepend warning to top of page to prompt human review --- wikiconvert_creole.py | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index 83efe13..e518ffc 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -54,7 +54,8 @@ def convert_file(proj_id, src_path, dst_dir): def sub_block(match,indent=True): pre = match.group(1) hash = md5(pre.encode('utf8')).hexdigest() - s_from_hash[hash] = _indent(pre) if indent else pre + # Creole uses braces, not indentation for code blocks + s_from_hash[hash] = "{{{"+pre+"}}}" if indent else pre return hash def sub_pre_block(match): @@ -127,11 +128,20 @@ def sub_link(m): # Restore hashed-out blocks. for hash, s in s_from_hash.items(): if text == text.replace(hash,s): - print 'Contained = ',text.find(hash) print 'Failed to replace %s with %s' % (hash,s) text = text.replace(hash, s) + + base = splitext(basename(src_path))[0] + + # Prepend warning block for manual review + text = '----\n**NOTE**: This page was automatically converted from the ' \ + + ('[[old page|http://code.google.com/p/google-refine/wiki/%s]] ' % base) \ + + ' at Google Code and has not been manually reviewed. Please compare it to ' \ + + ('[[the original|http://code.google.com/p/google-refine/wiki/%s]] ' % base) \ + + ', correct any errors or omissions, then remove this warning block.\n----\n\n\n' \ + +text - # Prepend summary.(not sure whether h3 or italics is best option here) + # Prepend summary if "summary" in meta: text = ("//%s//\n\n" % meta["summary"]) + text @@ -144,9 +154,7 @@ def sub_link(m): 'code.google.com/p/google-refine/source/browse/trunk/', 'github.com/OpenRefine/OpenRefine/blob/master/') - base = splitext(basename(src_path))[0] gh_page_name = _gh_page_name_from_gc_page_name(base) -# dst_path = join(dst_dir, gh_page_name+".md") dst_path = join(dst_dir, gh_page_name+".creole") if not exists(dst_path) or codecs.open(dst_path, 'r', 'utf-8').read() != text: codecs.open(dst_path, 'w', 'utf-8').write(text) From 89be3ab72bd93bf0f1de59f6fa0f8895a1b2e146 Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Sun, 23 Dec 2012 13:09:39 -0500 Subject: [PATCH 14/16] Fix numbered lists using correct Creole syntax --- wikiconvert_creole.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index e518ffc..db9905e 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -88,7 +88,7 @@ def sub_table_creole(m): # Lists (doesn't handle nested lists - flattens structure). text = re.compile(r'^[ \t]+\*[ \t]+(.*?)$', re.M).sub(r'{^} \1', text) # temp marker to avoid bold processing - text = re.compile(r'^[ \t]+#[ \t]+(.*?)$', re.M).sub(r'1. \1', text) + text = re.compile(r'^[ \t]+#[ \t]+(.*?)$', re.M).sub(r'# \1', text) # Italics, bold. - same for both Markdown & Creole # in*ter*bold: (?<=\w)(\*\w+?\*)(?=\w) From fd0e858d388a62ba6ae70a46ffe46cb66cfe19ec Mon Sep 17 00:00:00 2001 From: Tom Morris Date: Wed, 26 Dec 2012 11:34:57 -0500 Subject: [PATCH 15/16] Improve page name conversion --- wikiconvert_creole.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/wikiconvert_creole.py b/wikiconvert_creole.py index db9905e..d30fba6 100644 --- a/wikiconvert_creole.py +++ b/wikiconvert_creole.py @@ -168,8 +168,9 @@ def _indent(text): def _gh_page_name_from_gc_page_name(gc): """Github (gh) Wiki page name from Google Code (gc) Wiki page name.""" - if re.match(r'[A-Z][a-z]{2,}',gc): - gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:] + # Note: We can't handle both FetchingURLsFromWebServices & GRELFunctions + if re.match(r'[A-Z]+[a-z]{2,}',gc): + gh = re.sub(r'([A-Z]+[a-z]+)', r'-\1', gc)[1:] else: gh = gc return gh From b9b1a220ad4ff17700b9839a62e5c79a615752e7 Mon Sep 17 00:00:00 2001 From: Ilya Lipnitskiy Date: Sat, 9 Nov 2013 01:11:22 -0800 Subject: [PATCH 16/16] Fix a few parsing bugs --- wikiconvert.py | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/wikiconvert.py b/wikiconvert.py index 683a625..97203b1 100644 --- a/wikiconvert.py +++ b/wikiconvert.py @@ -38,10 +38,8 @@ def convert_file(proj_id, src_path, dst_dir): for i, line in enumerate(lines): if line.startswith("#"): meta_lines.append(line) - else: - assert not line.strip(), "line isn't empty in file %s %r" % (src_path, line) - # TODO is it actually mandatory that a blank line separate meta text from body text? - body_lines = lines[i+1:] + elif line.strip(): + body_lines = lines[i:] break meta = {} for line in meta_lines: @@ -58,6 +56,7 @@ def convert_file(proj_id, src_path, dst_dir): text = re.compile(r'^}}}+ *(\n|$)', re.M).sub(r"```\n", text) # TODO: Add support for `backtick` code quotes + text = re.sub(r'{{{(.*?)}}}', r'`\1`', text) # Headings. text = re.compile(r'^===(.*?)===\s*$', re.M).sub(lambda m: "### %s\n"%m.group(1).strip(), text) @@ -131,12 +130,11 @@ def sub_link(m): #---- internal support stuff def _indent(text): - return ' ' + '\n '.join(text.splitlines(False)) + return '\n ' + '\n '.join(text.splitlines(False)) def _gh_page_name_from_gc_page_name(gc): """Github (gh) Wiki page name from Google Code (gc) Wiki page name.""" - # FIXME: fails on all uppercase / all lowercase names (e.g. FAQ) - gh = re.sub(r'([A-Z][a-z]+)', r'-\1', gc)[1:] + gh = re.sub(r'([A-Za-z]+)_?', r'-\1', gc)[1:] return gh