diff --git a/.gitignore b/.gitignore
index ce7a7cef..5a57b80d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,8 @@ pip-log.txt
nosetests.xml
*.mo
.idea
+
+test.html
+testxml.html
+
+main.py
diff --git a/.travis.yml b/.travis.yml
index 6a5babb4..4251ba15 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,9 +2,13 @@ language: python
python:
- "2.6"
- "2.7"
-script: python main.py
+script: ./run_tests.sh
install:
+ - python setup.py -q install
- pip install -r requirements.txt
+env:
+ - TRAVIS_EXECUTE_PERFORMANCE=1
notifications:
email:
- jason.louard.ward@gmail.com
+ - samson91787@gmail.com
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 00000000..81a14d38
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,2 @@
+Sam Portnow
+Jason Ward
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 00000000..d40440c9
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,74 @@
+
+Changelog
+=========
+* 0.3.13
+ * Significant performance gains for documents with a large number of table
+ cells.
+ * Significant performance gains for large documents.
+* 0.3.12
+ * Added command line support to convert from docx to either html or
+ markdown.
+* 0.3.11
+  * The non-breaking hyphen tag was not being imported correctly. This issue
+    has been fixed.
+* 0.3.10
+ * Found and optimized a fairly large performance issue with tables that had
+ large amounts of content within a single cell, which includes nested
+ tables.
+* 0.3.9
+  * We are now respecting the `` element. A space is inserted wherever it
+    occurs.
+  * Each style can have a default defined based on values in `styles.xml`.
+    These default styles can be overridden using the `rPr` on the actual `r`
+    tag. The defaults defined in `styles.xml` are now actually being
+    respected.
+* 0.3.8
+  * If zipfile fails to open the passed-in file, we now raise a
+    `MalformedDocxException` instead of a `BadZipfile`.
+* 0.3.7
+ * Some inline tags (most notably the underline tag) could have a `val` of
+ `none` and that would signify that the style is disabled. A `val` of
+ `none` is now correctly handled.
+* 0.3.6
+ * It is possible for a docx file to not contain a `numbering.xml` file but
+ still try to use lists. Now if this happens all lists get converted to
+ paragraphs.
+* 0.3.5
+ * Not all docx files contain a `styles.xml` file. We are no longer assuming
+ they do.
+* 0.3.4
+ * It is possible for `w:t` tags to have `text` set to `None`. This no
+ longer causes an error when escaping that text.
+* 0.3.3
+  * In the event that `cElementTree` has a problem parsing the document, a
+    `MalformedDocxException` is raised instead of a `SyntaxError`.
+* 0.3.2
+  * We were not taking into account that vertical merges should have a
+    continue attribute, but sometimes they do not, and in those cases Word
+    assumes the continue attribute. We updated the parser to handle the
+    cases in which the continue attribute is not there.
+  * We now correctly handle documents with unicode characters in the
+    namespace.
+ * In rare cases, some text would be output with a style when it should not
+ have been. This issue has been fixed.
+* 0.3.1
+ * Added support for several more OOXML tags including:
+ * caps
+ * smallCaps
+ * strike
+ * dstrike
+ * vanish
+ * webHidden
+ More details in the README.
+* 0.3.0
+ * We switched from using stock *xml.etree.ElementTree* to using
+ *xml.etree.cElementTree*. This has resulted in a fairly significant speed
+    increase for Python 2.6.
+ * It is now possible to create your own pre processor to do additional pre
+ processing.
+ * Superscripts and subscripts are now extracted correctly.
+* 0.2.1
+ * Added a changelog
+  * Added the version in `pydocx.__init__`
+  * Fixed an issue with duplicating content if there was indentation or
+    justification on a `p` element that had multiple `t` tags.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..88fbbf67
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,7 @@
+include AUTHORS
+include CHANGELOG
+include LICENSE
+include MANIFEST.in
+include README.rst
+include pydocx/fixtures/*
+include pydocx/tests/templates/*
diff --git a/README.md b/README.md
deleted file mode 100644
index e3773551..00000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-pydocx
-======
\ No newline at end of file
diff --git a/README.rst b/README.rst
new file mode 100644
index 00000000..1bb9b3b1
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,238 @@
+======
+pydocx
+======
+.. image:: https://travis-ci.org/CenterForOpenScience/pydocx.png?branch=master
+ :align: left
+ :target: https://travis-ci.org/CenterForOpenScience/pydocx
+
+pydocx is a parser that breaks down the elements of a docx file and converts
+them into different markup languages. Right now, HTML is supported. Markdown
+and LaTeX will be available soon. You can extend any of the available parsers
+to customize them to your needs. You can also create your own class that
+inherits from DocxParser to implement a markup language that is not yet
+supported.
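+
+For example, converting a file to HTML from Python is a one-liner (a minimal
+sketch using the `docx2html` helper):
+
+::
+
+    from pydocx import docx2html
+
+    html = docx2html('path/to/file.docx')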
+
+Currently Supported
+###################
+
+* tables
+ * nested tables
+ * rowspans
+ * colspans
+ * lists in tables
+* lists
+ * list styles
+ * nested lists
+ * list of tables
+    * list of paragraphs
+* justification
+* images
+* styles
+ * bold
+ * italics
+ * underline
+ * hyperlinks
+* headings
+
+Usage
+#####
+
+DocxParser includes abstract methods that each parser overrides to satisfy its own needs. The abstract methods are as follows:
+
+::
+
+ class DocxParser:
+
+ @property
+ def parsed(self):
+ return self._parsed
+
+ @property
+ def escape(self, text):
+ return text
+
+ @abstractmethod
+ def linebreak(self):
+ return ''
+
+ @abstractmethod
+ def paragraph(self, text):
+ return text
+
+ @abstractmethod
+ def heading(self, text, heading_level):
+ return text
+
+ @abstractmethod
+ def insertion(self, text, author, date):
+ return text
+
+ @abstractmethod
+ def hyperlink(self, text, href):
+ return text
+
+ @abstractmethod
+ def image_handler(self, path):
+ return path
+
+ @abstractmethod
+ def image(self, path, x, y):
+ return self.image_handler(path)
+
+ @abstractmethod
+ def deletion(self, text, author, date):
+ return text
+
+ @abstractmethod
+ def bold(self, text):
+ return text
+
+ @abstractmethod
+ def italics(self, text):
+ return text
+
+ @abstractmethod
+ def underline(self, text):
+ return text
+
+ @abstractmethod
+ def superscript(self, text):
+ return text
+
+ @abstractmethod
+ def subscript(self, text):
+ return text
+
+ @abstractmethod
+ def tab(self):
+ return True
+
+ @abstractmethod
+ def ordered_list(self, text):
+ return text
+
+ @abstractmethod
+ def unordered_list(self, text):
+ return text
+
+ @abstractmethod
+ def list_element(self, text):
+ return text
+
+ @abstractmethod
+ def table(self, text):
+ return text
+
+        @abstractmethod
+ def table_row(self, text):
+ return text
+
+ @abstractmethod
+ def table_cell(self, text):
+ return text
+
+ @abstractmethod
+ def page_break(self):
+ return True
+
+ @abstractmethod
+ def indent(self, text, left='', right='', firstLine=''):
+ return text
+
+Docx2Html inherits from DocxParser and implements basic HTML handling. For example:
+
+::
+
+ class Docx2Html(DocxParser):
+
+ # Escape '&', '<', and '>' so we render the HTML correctly
+ def escape(self, text):
+ return xml.sax.saxutils.quoteattr(text)[1:-1]
+
+ # return a line break
+        def linebreak(self, pre=None):
+            return '<br />'
+
+ # add paragraph tags
+        def paragraph(self, text, pre=None):
+            return '<p>' + text + '</p>'
+
+
+However, let's say you want to add a specific style to your HTML document. In order to do this, you want to give each paragraph the class `my_implementation`. Simply extend Docx2Html and add what you need.
+
+::
+
+ class My_Implementation_of_Docx2Html(Docx2Html):
+
+        def paragraph(self, text, pre=None):
+            return '<p class="my_implementation">' + text + '</p>'
+
+
+
+OR, let's say FOO is your new favorite markup language. Simply create your own new parser, overriding the abstract methods of DocxParser:
+
+::
+
+ class Docx2Foo(DocxParser):
+
+        # because linebreaks are denoted by '!!!!!!!!!!!!' in the FOO markup language :)
+ def linebreak(self):
+ return '!!!!!!!!!!!!'
+
+Custom Pre-Processor
+####################
+
+When creating your own parser (as described above) you can also add your own custom pre-processor. To do so you will need to set the `pre_processor_class` field on the custom parser, like so:
+
+::
+
+ class Docx2Foo(DocxParser):
+ pre_processor_class = FooPreProcessor
+
+
+The `FooPreProcessor` will need a few things to get you going:
+
+::
+
+ class FooPreProcessor(PydocxPreProcessor):
+ def perform_pre_processing(self, root, *args, **kwargs):
+ super(FooPreProcessor, self).perform_pre_processing(root, *args, **kwargs)
+ self._set_foo(root)
+
+ def _set_foo(self, root):
+ pass
+
+If you want `_set_foo` to be called, you must add it to `perform_pre_processing`, which is called in the base parser for pydocx.
+
+Everything done during pre-processing is executed prior to `parse` being called for the first time.
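+
+Concretely, the ordering looks roughly like this (condensed from
+`parse_begin` in `DocxParser.py`, not the full implementation):
+
+::
+
+    # Pre-processing runs over the document root first...
+    self.pre_processor = self.pre_processor_class(
+        convert_root_level_upper_roman=self.convert_root_level_upper_roman,
+        styles_dict=self.styles_dict,
+        numbering_root=self.numbering_root,
+    )
+    self.pre_processor.perform_pre_processing(el)
+    # ...and only afterwards is the tree actually parsed.
+    self._parsed += self.parse(el)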
+
+
+Styles
+######
+
+The base parser `Docx2Html` relies on certain CSS classes being set for certain behaviour to occur. Currently these include:
+
+* class `pydocx-insert` -> Turns the text green.
+* class `pydocx-delete` -> Turns the text red and draws a line through the text.
+* class `pydocx-center` -> Aligns the text to the center.
+* class `pydocx-right` -> Aligns the text to the right.
+* class `pydocx-left` -> Aligns the text to the left.
+* class `pydocx-comment` -> Turns the text blue.
+* class `pydocx-underline` -> Underlines the text.
+* class `pydocx-caps` -> Makes all text uppercase.
+* class `pydocx-small-caps` -> Makes all text uppercase, however truly lowercase letters will be smaller than their uppercase counterparts.
+* class `pydocx-strike` -> Strikes a line through the text.
+* class `pydocx-hidden` -> Hide the text.
+
+Exceptions
+##########
+
+Right now there is only one custom exception (`MalformedDocxException`). It is raised if either the `xml` or `zipfile` libraries raise an exception.
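+
+For example, a minimal sketch of guarding a conversion against a bad file:
+
+::
+
+    from pydocx import docx2html
+    from pydocx.exceptions import MalformedDocxException
+
+    try:
+        html = docx2html('path/to/file.docx')
+    except MalformedDocxException:
+        # The file is not a valid docx (bad zip or unparseable XML).
+        html = None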
+
+Optional Arguments
+##################
+
+You can pass `convert_root_level_upper_roman=True` to the parser and it will convert all root-level upper roman lists to headings instead.
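+
+For example (a sketch; the keyword argument is accepted by any parser that
+inherits from DocxParser):
+
+::
+
+    from pydocx import Docx2Html
+
+    html = Docx2Html(
+        'path/to/file.docx',
+        convert_root_level_upper_roman=True,
+    ).parsed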
+
+Command Line Execution
+######################
+
+First you have to install pydocx; this can be done by running `pip install pydocx`. From there you can simply call the command `pydocx --html path/to/file.docx path/to/output.html`. Change `pydocx --html` to `pydocx --markdown` in order to convert to Markdown instead.
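+
+The equivalent conversion from Python looks roughly like this (a sketch
+mirroring what the command line entry point does):
+
+::
+
+    from pydocx import Docx2Html
+
+    html = Docx2Html('path/to/file.docx').parsed
+    with open('path/to/output.html', 'w') as f:
+        f.write(html.encode('utf-8'))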
diff --git a/main.py b/main.py
deleted file mode 100644
index c9e8e1d4..00000000
--- a/main.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from pydocx import *
-from bs4 import BeautifulSoup
-import xml.etree.ElementTree as ElementTree
-#import lxml.etree as etree
-
-with open('test.html', 'w') as f:
- f.write(docx2html('helloworld.docx'))
-with open('testxml.html','w') as f:
- f.write(BeautifulSoup(ElementTree.tostring(Docx2Html('helloworld.docx').root)).prettify())
-
-#print docx2html('helloworld.docx')
-#print docx2markdown('helloworld.docx')
\ No newline at end of file
diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py
index b3006ef0..fb08b180 100644
--- a/pydocx/DocxParser.py
+++ b/pydocx/DocxParser.py
@@ -1,323 +1,715 @@
-from abc import abstractmethod, ABCMeta
-import zipfile
import logging
-import xml.etree.ElementTree as ElementTree
-from xml.etree.ElementTree import _ElementInterface
+import os
+import zipfile
+
+from abc import abstractmethod, ABCMeta
+from contextlib import contextmanager
+
+from pydocx.utils import (
+ MulitMemoizeMixin,
+ PydocxPreProcessor,
+ find_all,
+ find_ancestor_with_tag,
+ find_first,
+ get_list_style,
+ has_descendant_with_tag,
+ parse_xml_from_string,
+)
+from pydocx.exceptions import MalformedDocxException
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger("NewParser")
-def remove_namespaces(document):
- root = ElementTree.fromstring(document)
- for child in el_iter(root):
- child.tag = child.tag.split("}")[1]
- child.attrib = dict(
- (k.split("}")[1], v)
- for k, v in child.attrib.items()
- )
- return ElementTree.tostring(root)
-
-# Add some helper functions to Element to make it slightly more readable
-
-
-def has_child(self, tag):
- return True if self.find(tag) is not None else False
-
-
-def has_child_all(self, tag):
- return True if self.find('.//' + tag) is not None else False
+# http://openxmldeveloper.org/discussions/formats/f/15/p/396/933.aspx
+EMUS_PER_PIXEL = 9525
+USE_ALIGNMENTS = True
+JUSTIFY_CENTER = 'center'
+JUSTIFY_LEFT = 'left'
+JUSTIFY_RIGHT = 'right'
-def find_all(self, tag):
- return self.find('.//' + tag)
+INDENTATION_RIGHT = 'right'
+INDENTATION_LEFT = 'left'
+INDENTATION_FIRST_LINE = 'firstLine'
+DISABLED_STYLE_VALUES = ['false', '0', 'none']
-
-def findall_all(self, tag):
- return self.findall('.//' + tag)
+# Add some helper functions to Element to make it slightly more readable
-def el_iter(el):
+@contextmanager
+def ZipFile(path): # This is not needed in python 3.2+
try:
- return el.iter()
- except AttributeError:
- return el.findall('.//*')
-
-
-setattr(_ElementInterface, 'has_child', has_child)
-setattr(_ElementInterface, 'has_child_all', has_child_all)
-setattr(_ElementInterface, 'find_all', find_all)
-setattr(_ElementInterface, 'findall_all', findall_all)
-setattr(_ElementInterface, 'parent', None)
-setattr(_ElementInterface, 'parent_list', [])
-
-# End helpers
+ f = zipfile.ZipFile(path)
+ except zipfile.BadZipfile:
+ raise MalformedDocxException('Passed in document is not a docx')
+ yield f
+ f.close()
-class DocxParser:
+class DocxParser(MulitMemoizeMixin):
__metaclass__ = ABCMeta
+ pre_processor_class = PydocxPreProcessor
- def __init__(self, path):
- self._parsed = ''
- self.in_list = False
-
- f = zipfile.ZipFile(path)
+ def _extract_xml(self, f, xml_path):
try:
- self.document_text = f.read('word/document.xml')
- try:
- self.numbering_text = f.read('word/numbering.xml')
- except zipfile.BadZipfile:
- pass
- try:
- self.comment_text = f.read('word/comments.xml')
- except zipfile.BadZipfile:
- pass
- finally:
- f.close()
-
- self.root = ElementTree.fromstring(
- remove_namespaces(self.document_text),
- )
-
- def add_parent(el):
- for child in el.getchildren():
- setattr(child, 'parent', el)
- add_parent(child)
- add_parent(self.root)
-
- def create_parent_list(el, tmp=None):
- if tmp is None:
- tmp = []
- for child in el:
- tmp.append(el)
- tmp = create_parent_list(child, tmp)
- el.parent_list = tmp[:]
- try:
- tmp.pop()
- except:
- tmp = []
- return tmp
-
- create_parent_list(self.root)
+ return f.read(xml_path)
+ except KeyError:
+ return None
+ def _build_data(self, path, *args, **kwargs):
+ with ZipFile(path) as f:
+ # These must be in the ZIP in order for the docx to be valid.
+ self.document_text = f.read('word/document.xml')
+ self.relationship_text = f.read('word/_rels/document.xml.rels')
+
+ # These are all optional.
+ self.styles_text = self._extract_xml(f, 'word/styles.xml')
+ self.fonts = self._extract_xml(f, 'word/fontTable.xml')
+ self.numbering_text = self._extract_xml(f, 'word/numbering.xml')
+ self.comment_text = self._extract_xml(f, 'word/comments.xml')
+
+ zipped_image_files = [
+ e for e in f.infolist()
+ if e.filename.startswith('word/media/')
+ ]
+ for e in zipped_image_files:
+ self._image_data[e.filename] = f.read(e.filename)
+
+ self.root = parse_xml_from_string(self.document_text)
+ self.numbering_root = None
+ if self.numbering_text:
+ self.numbering_root = parse_xml_from_string(self.numbering_text)
+ self.comment_root = None
+ if self.comment_text:
+ self.comment_root = parse_xml_from_string(self.comment_text)
+
+ def _parse_run_properties(self, rPr):
+ """
+        Takes an `rPr` and returns a dictionary containing the tag name mapped
+        to the child's value property.
+
+        If you have an rPr that looks like this:
+
+        <rPr>
+            <b/>
+            <u val="false"/>
+            <sz val="16"/>
+        </rPr>
+
+ That will result in a dictionary that looks like this:
+ {
+ 'b': '',
+ 'u': 'false',
+ 'sz': '16',
+ }
+ """
+ run_properties = {}
+ if rPr is None:
+ return {}
+ for run_property in rPr:
+ val = run_property.get('val', '').lower()
+ run_properties[run_property.tag] = val
+ return run_properties
+
+ def _parse_styles(self):
+ if self.styles_text is None:
+ return {}
+ tree = parse_xml_from_string(self.styles_text)
+ styles_dict = {}
+ for style in find_all(tree, 'style'):
+ style_val = find_first(style, 'name').attrib['val']
+ run_properties = find_first(style, 'rPr')
+ styles_dict[style.attrib['styleId']] = {
+ 'style_name': style_val,
+ 'default_run_properties': self._parse_run_properties(
+ run_properties,
+ ),
+ }
+ return styles_dict
+
+ def _parse_rels_root(self):
+ tree = parse_xml_from_string(self.relationship_text)
+ rels_dict = {}
+ for el in tree:
+ rId = el.get('Id')
+ target = el.get('Target')
+ rels_dict[rId] = target
+ return rels_dict
+
+ def __init__(
+ self,
+ path,
+ convert_root_level_upper_roman=False,
+ *args,
+ **kwargs):
+ self._parsed = ''
+ self.block_text = ''
+ self.page_width = 0
+ self.convert_root_level_upper_roman = convert_root_level_upper_roman
+ self._image_data = {}
+ self._build_data(path, *args, **kwargs)
+ self.pre_processor = None
+
+        # divide by 20 to get to pt (Office works in 20ths of a point)
+ """
+ see http://msdn.microsoft.com/en-us/library/documentformat
+ .openxml.wordprocessing.indentation.aspx
+ """
+ if find_first(self.root, 'pgSz') is not None:
+ self.page_width = int(
+ find_first(self.root, 'pgSz').attrib['w']
+ ) / 20
+
+ #all blank when we init
self.comment_store = None
- self.numbering_store = None
- self.ignore_current = False
- self.elements = []
- self.tables_seen = []
- self.visited = []
- try:
- self.numbering_root = ElementTree.fromstring(
- remove_namespaces(self.numbering_text),
- )
- except:
- pass
- self.parse_begin(self.root)
+ self.visited = set()
+ self.list_depth = 0
+ self.rels_dict = self._parse_rels_root()
+ self.styles_dict = self._parse_styles()
+ self.parse_begin(self.root) # begin to parse
def parse_begin(self, el):
- self._parsed += self.parse_lists(el)
-
-### parse table function and is_table flag
- def parse_lists(self, el):
- parsed = ''
- first_p = el.find_all('p')
- children = []
- for child in first_p.parent:
- if child.tag == 'p' or child.tag == 'tbl':
- children.append(child)
- p_list = children
- list_started = False
- list_type = ''
- list_chunks = []
- index_start = 0
- index_end = 1
- for i, el in enumerate(p_list):
- if not list_started and el.has_child_all('ilvl'):
- list_started = True
- list_type = self.get_list_style(
- el.find_all('numId').attrib['val'],
- )
- list_chunks.append(p_list[index_start:index_end])
- index_start = i
- index_end = i+1
- elif (
- list_started and
- el.has_child_all('ilvl') and
- not list_type == self.get_list_style(
- el.find_all('numId').attrib['val']
- )):
- list_type = self.get_list_style(
- el.find_all('numId').attrib['val'],
- )
- list_started = True
- list_chunks.append(p_list[index_start:index_end])
- index_start = i
- index_end = i+1
- elif list_started and not el.has_child_all('ilvl'):
- list_started = False
- list_chunks.append(p_list[index_start:index_end])
- index_start = i
- index_end = i+1
- else:
- index_end = i+1
- list_chunks.append(p_list[index_start:index_end])
- for chunk in list_chunks:
- chunk_parsed = ''
- for el in chunk:
- chunk_parsed += self.parse(el)
- if chunk[0].has_child_all('ilvl'):
- lst_style = self.get_list_style(
- chunk[0].find_all('numId').attrib['val'],
- )
- if lst_style['val'] == 'bullet':
- parsed += self.unordered_list(chunk_parsed)
- else:
- parsed += self.ordered_list(chunk_parsed)
- elif chunk[0].has_child_all('br'):
- parsed += self.page_break()
- else:
- parsed += chunk_parsed
-
- return parsed
+ self.populate_memoization({
+ 'find_all': find_all,
+ 'find_first': find_first,
+ 'has_descendant_with_tag': has_descendant_with_tag,
+ '_get_tcs_in_column': self._get_tcs_in_column,
+ })
+ self.pre_processor = self.pre_processor_class(
+ convert_root_level_upper_roman=self.convert_root_level_upper_roman,
+ styles_dict=self.styles_dict,
+ numbering_root=self.numbering_root,
+ )
+ self.pre_processor.perform_pre_processing(el)
+ self._parsed += self.parse(el)
def parse(self, el):
+ if el in self.visited:
+ return ''
+ self.visited.add(el)
parsed = ''
- if not self.ignore_current:
- tmp_d = dict(
- (tmpel.tag, i)
- for i, tmpel in enumerate(el.parent_list)
- )
- if (
- 'tbl' in tmp_d and
- el.parent_list[tmp_d['tbl']] not in self.tables_seen):
- self.ignore_current = True
- self.tables_seen.append(el.parent_list[tmp_d['tbl']])
- tmpout = self.table(self.parse(el.parent_list[tmp_d['tbl']]))
- self.ignore_current = False
- return tmpout
-
for child in el:
+ # recursive. So you can get all the way to the bottom
parsed += self.parse(child)
-
- if el.tag == 'br' and el.attrib['type'] == 'page':
- #TODO figure out what parsed is getting overwritten
- return self.page_break()
- # add it to the list so we don't repeat!
- if el.tag == 'ilvl' and el not in self.visited:
- self.in_list = True
- self.visited.append(el)
- ## This starts the returns
+ if el.tag == 'br' and el.attrib.get('type') == 'page':
+ return self.parse_page_break(el, parsed)
+ # page breaks use lastRenderedPageBreak in MS Word > 2007
+ elif el.tag == 'lastRenderedPageBreak':
+ return self.parse_page_break(el, parsed)
+ elif el.tag == 'tbl':
+ return self.parse_table(el, parsed)
elif el.tag == 'tr':
- return self.table_row(parsed)
+ return self.parse_table_row(el, parsed)
elif el.tag == 'tc':
- self.elements.append(el)
- return self.table_cell(parsed)
- if el.tag == 'r' and el not in self.elements:
- self.elements.append(el)
- return self.parse_r(el)
+ return self.parse_table_cell(el, parsed)
+ elif el.tag == 'r':
+ return self.parse_r(el, parsed)
+ elif el.tag == 't':
+ return self.parse_t(el, parsed)
+ elif el.tag == 'tab':
+ return self.parse_tab(el, parsed)
+ elif el.tag == 'noBreakHyphen':
+ return self.parse_hyphen(el, parsed)
+ elif el.tag == 'br':
+ return self.parse_break_tag(el, parsed)
+ elif el.tag == 'delText':
+ return self.parse_deletion(el, parsed)
elif el.tag == 'p':
return self.parse_p(el, parsed)
elif el.tag == 'ins':
- return self.insertion(parsed, '', '')
+ return self.parse_insertion(el, parsed)
+ elif el.tag == 'hyperlink':
+ return self.parse_hyperlink(el, parsed)
+ elif el.tag in ('pict', 'drawing'):
+ return self.parse_image(el)
+ else:
+ return parsed
+
+ def parse_page_break(self, el, text):
+ #TODO figure out what parsed is getting overwritten
+ return self.page_break()
+
+ def parse_table(self, el, text):
+ return self.table(text)
+
+ def parse_table_row(self, el, text):
+ return self.table_row(text)
+
+ def parse_table_cell(self, el, text):
+ v_merge = find_first(el, 'vMerge')
+ if v_merge is not None and (
+ 'restart' != v_merge.get('val', '')):
+ return ''
+ colspan = self.get_colspan(el)
+ rowspan = self._get_rowspan(el, v_merge)
+ if rowspan > 1:
+ rowspan = str(rowspan)
+ else:
+ rowspan = ''
+ return self.table_cell(text, colspan, rowspan)
+
+ def parse_list(self, el, text):
+ """
+ All the meat of building the list is done in _parse_list, however we
+        call this method for two reasons: it is the naming convention we are
+        following, and we need a reliable way to raise and lower the list_depth
+ (which is used to determine if we are in a list). I could have done
+ this in _parse_list, however it seemed cleaner to do it here.
+ """
+ self.list_depth += 1
+ parsed = self._parse_list(el, text)
+ self.list_depth -= 1
+ if self.pre_processor.is_in_table(el):
+ return self.parse_table_cell_contents(el, parsed)
+ return parsed
+
+ def get_list_style(self, num_id, ilvl):
+ return get_list_style(self.numbering_root, num_id, ilvl)
+
+ def _build_list(self, el, text):
+ # Get the list style for the pending list.
+ lst_style = self.get_list_style(
+ self.pre_processor.num_id(el).num_id,
+ self.pre_processor.ilvl(el),
+ )
+
+ parsed = text
+ # Create the actual list and return it.
+ if lst_style == 'bullet':
+ return self.unordered_list(parsed)
else:
+ return self.ordered_list(
+ parsed,
+ lst_style,
+ )
+
+ def _parse_list(self, el, text):
+ parsed = self.parse_list_item(el, text)
+ num_id = self.pre_processor.num_id(el)
+ ilvl = self.pre_processor.ilvl(el)
+ # Everything after this point assumes the first element is not also the
+ # last. If the first element is also the last then early return by
+ # building and returning the completed list.
+ if self.pre_processor.is_last_list_item_in_root(el):
+ return self._build_list(el, parsed)
+ next_el = self.pre_processor.next(el)
+
+ def is_same_list(next_el, num_id, ilvl):
+ # Bail if next_el is not an element
+ if next_el is None:
+ return False
+ if self.pre_processor.is_last_list_item_in_root(next_el):
+ return False
+ # If next_el is not a list item then roll it into the list by
+ # returning True.
+ if not self.pre_processor.is_list_item(next_el):
+ return True
+ if self.pre_processor.num_id(next_el) != num_id:
+ # The next element is a new list entirely
+ return False
+ if self.pre_processor.ilvl(next_el) < ilvl:
+ # The next element is de-indented, so this is really the last
+ # element in the list
+ return False
+ return True
+
+ while is_same_list(next_el, num_id, ilvl):
+ if next_el in self.visited:
+ # Early continue for elements we have already visited.
+ next_el = self.pre_processor.next(next_el)
+ continue
+
+ if self.pre_processor.is_list_item(next_el):
+ # Reset the ilvl
+ ilvl = self.pre_processor.ilvl(next_el)
+
+ parsed += self.parse(next_el)
+ next_el = self.pre_processor.next(next_el)
+
+ def should_parse_last_el(last_el, first_el):
+ if last_el is None:
+ return False
+ # Different list
+ if (
+ self.pre_processor.num_id(last_el) !=
+ self.pre_processor.num_id(first_el)):
+ return False
+ # Will be handled when the ilvls do match (nesting issue)
+ if (
+ self.pre_processor.ilvl(last_el) !=
+ self.pre_processor.ilvl(first_el)):
+ return False
+ # We only care about last items that have not been parsed before
+ # (first list items are always parsed at the beginning of this
+ # method.)
+ return (
+ not self.pre_processor.is_first_list_item(last_el) and
+ self.pre_processor.is_last_list_item_in_root(last_el)
+ )
+ if should_parse_last_el(next_el, el):
+ parsed += self.parse(next_el)
+
+ # If the list has no content, then we don't need to worry about the
+ # list styling, because it will be stripped out.
+ if parsed == '':
return parsed
+ return self._build_list(el, parsed)
+
+ def justification(self, el, text):
+ paragraph_tag_property = el.find('pPr')
+ if paragraph_tag_property is None:
+ return text
+
+ _justification = paragraph_tag_property.find('jc')
+ indentation = paragraph_tag_property.find('ind')
+ if _justification is None and indentation is None:
+ return text
+ alignment = None
+ right = None
+ left = None
+ firstLine = None
+ if _justification is not None: # text alignments
+ value = _justification.attrib['val']
+ if value in [JUSTIFY_LEFT, JUSTIFY_CENTER, JUSTIFY_RIGHT]:
+ alignment = value
+
+ if indentation is not None:
+ if INDENTATION_RIGHT in indentation.attrib:
+ right = indentation.attrib[INDENTATION_RIGHT]
+ # divide by 20 to get to pt. multiply by (4/3) to get to px
+ right = (int(right) / 20) * float(4) / float(3)
+ right = str(right)
+ if INDENTATION_LEFT in indentation.attrib:
+ left = indentation.attrib[INDENTATION_LEFT]
+ left = (int(left) / 20) * float(4) / float(3)
+ left = str(left)
+ if INDENTATION_FIRST_LINE in indentation.attrib:
+ firstLine = indentation.attrib[INDENTATION_FIRST_LINE]
+ firstLine = (int(firstLine) / 20) * float(4) / float(3)
+ firstLine = str(firstLine)
+ if any([alignment, firstLine, left, right]):
+ return self.indent(text, alignment, firstLine, left, right)
+ return text
+
def parse_p(self, el, text):
+ if text == '':
+ return ''
+ # TODO This is still not correct, however it fixes the bug. We need to
+ # apply the classes/styles on p, td, li and h tags instead of inline,
+ # but that is for another ticket.
+ text = self.justification(el, text)
+ if self.pre_processor.is_first_list_item(el):
+ return self.parse_list(el, text)
+ if self.pre_processor.heading_level(el):
+ return self.parse_heading(el, text)
+ if self.pre_processor.is_list_item(el):
+ return self.parse_list_item(el, text)
+ if self.pre_processor.is_in_table(el):
+ return self.parse_table_cell_contents(el, text)
parsed = text
- if self.in_list:
- self.in_list = False
- parsed = self.list_element(parsed)
- elif (
- not el.has_child_all('t') and
- 'tbl' not in [i.tag for i in el.parent_list]):
- parsed = self.linebreak()
- elif el.parent not in self.elements:
+ # No p tags in li tags
+ if self.list_depth == 0:
parsed = self.paragraph(parsed)
return parsed
- def parse_r(self, el):
- is_deleted = False
- text = None
- if el.has_child('t'):
- text = self.escape(el.find('t').text)
- elif el.has_child('delText'):
- text = self.escape(el.find('delText').text)
- is_deleted = True
- if text:
- rpr = el.find('rPr')
- if rpr is not None:
- fns = []
- if rpr.has_child('b'):
- fns.append(self.bold)
- if rpr.has_child('i'):
- fns.append(self.italics)
- if rpr.has_child('u'):
- fns.append(self.underline)
- for fn in fns:
- text = fn(text)
- ppr = el.parent.find('pPr')
- if ppr is not None:
- jc = ppr.find('jc')
- if jc is not None:
- if jc.attrib['val'] == 'right':
- text = self.right_justify(text)
- if jc.attrib['val'] == 'center':
- text = self.center_justify(text)
- ind = ppr.find('ind')
- if ind is not None:
- right = None
- left = None
- firstLine = None
- if 'right' in ind.attrib:
- right = ind.attrib['right']
- right = int(right)/20
- right = str(right)
- if 'left' in ind.attrib:
- left = ind.attrib['left']
- left = int(left)/20
- left = str(left)
- if 'firstLine' in ind.attrib:
- firstLine = ind.attrib['firstLine']
- firstLine = int(firstLine)/20
- firstLine = str(firstLine)
- text = self.indent(text, right, left, firstLine)
- if is_deleted:
- text = self.deletion(text, '', '')
+ def _should_append_break_tag(self, next_el):
+ paragraph_like_tags = [
+ 'p',
+ ]
+ inline_like_tags = [
+ 'smartTag',
+ 'ins',
+ 'delText',
+ ]
+ if self.pre_processor.is_list_item(next_el):
+ return False
+ if self.pre_processor.previous(next_el) is None:
+ return False
+ tag_is_inline_like = any(
+ self.memod_tree_op('has_descendant_with_tag', next_el, tag) for
+ tag in inline_like_tags
+ )
+ if tag_is_inline_like:
+ return False
+ if (
+ self.pre_processor.is_last_list_item_in_root(
+ self.pre_processor.previous(next_el))):
+ return False
+ if self.pre_processor.previous(next_el).tag not in paragraph_like_tags:
+ return False
+ if next_el.tag not in paragraph_like_tags:
+ return False
+ return True
+
+ def parse_heading(self, el, parsed):
+ return self.heading(parsed, self.pre_processor.heading_level(el))
+
+ def parse_list_item(self, el, text):
+ # If for whatever reason we are not currently in a list, then start
+ # a list here. This will only happen if the num_id/ilvl combinations
+ # between lists is not well formed.
+ parsed = text
+ if self.list_depth == 0:
+ return self.parse_list(el, parsed)
+
+ def _should_parse_next_as_content(el):
+ """
+ Get the contents of the next el and append it to the
+ contents of the current el (that way things like tables
+ are actually in the li tag instead of in the ol/ul tag).
+ """
+ next_el = self.pre_processor.next(el)
+ if next_el is None:
+ return False
+ if (
+ not self.pre_processor.is_list_item(next_el) and
+ not self.pre_processor.is_last_list_item_in_root(el)
+ ):
+ return True
+ if self.pre_processor.is_first_list_item(next_el):
+ if (
+ self.pre_processor.num_id(next_el) ==
+ self.pre_processor.num_id(el)):
+ return True
+ return False
+
+ while el is not None:
+ if _should_parse_next_as_content(el):
+ el = self.pre_processor.next(el)
+ next_elements_content = self.parse(el)
+ if not next_elements_content:
+ continue
+ if self._should_append_break_tag(el):
+ parsed += self.break_tag()
+ parsed += next_elements_content
+ else:
+ break
+ # Create the actual li element
+ return self.list_element(parsed)
+
+ def _get_tcs_in_column(self, tbl, column_index):
+ return [
+ tc for tc in self.memod_tree_op('find_all', tbl, 'tc')
+ if self.pre_processor.column_index(tc) == column_index
+ ]
+
+ def _get_rowspan(self, el, v_merge):
+ restart_in_v_merge = False
+ if v_merge is not None and 'val' in v_merge.attrib:
+ restart_in_v_merge = 'restart' in v_merge.attrib['val']
+
+ if not restart_in_v_merge:
+ return ''
+
+ current_row = self.pre_processor.row_index(el)
+ current_col = self.pre_processor.column_index(el)
+ rowspan = 1
+ result = ''
+ tbl = find_ancestor_with_tag(self.pre_processor, el, 'tbl')
+        # We only want table cells that have a row_index greater than the
+        # current_row and that are in the current_col
+ if tbl is None:
+ return ''
+
+ tcs = [
+ tc for tc in self.memod_tree_op(
+ '_get_tcs_in_column', tbl, current_col,
+ ) if self.pre_processor.row_index(tc) >= current_row
+ ]
+
+ def should_increment_rowspan(tc):
+ if not self.pre_processor.vmerge_continue(tc):
+ return False
+ return True
+
+ for tc in tcs:
+ if should_increment_rowspan(tc):
+ rowspan += 1
+ else:
+ rowspan = 1
+ if rowspan > 1:
+ result = rowspan
+ return str(result)
+
+ def get_colspan(self, el):
+ grid_span = find_first(el, 'gridSpan')
+ if grid_span is None:
+ return ''
+ return grid_span.attrib['val']
+
+ def parse_table_cell_contents(self, el, text):
+ parsed = text
+
+ next_el = self.pre_processor.next(el)
+ if next_el is not None:
+ if self._should_append_break_tag(next_el):
+ parsed += self.break_tag()
+ return parsed
+
+ def parse_hyperlink(self, el, text):
+ rId = el.get('id')
+ href = self.rels_dict.get(rId)
+ if not href:
return text
- else:
+ href = self.escape(href)
+ return self.hyperlink(text, href)
+
+ def _get_image_id(self, el):
+ # Drawings
+ blip = find_first(el, 'blip')
+ if blip is not None:
+ # On drawing tags the id is actually whatever is returned from the
+ # embed attribute on the blip tag. Thanks a lot Microsoft.
+ return blip.get('embed')
+ # Picts
+ imagedata = find_first(el, 'imagedata')
+ if imagedata is not None:
+ return imagedata.get('id')
+
+ def _convert_image_size(self, size):
+ return size / EMUS_PER_PIXEL
+
+ def _get_image_size(self, el):
+ """
+ If we can't find a height or width, return 0 for whichever is not
+ found, then rely on the `image` handler to strip those attributes. This
+ functionality can change once we integrate PIL.
+ """
+ sizes = find_first(el, 'ext')
+ if sizes is not None and sizes.get('cx'):
+ if sizes.get('cx'):
+ x = self._convert_image_size(int(sizes.get('cx')))
+ if sizes.get('cy'):
+ y = self._convert_image_size(int(sizes.get('cy')))
+ return (
+ '%dpx' % x,
+ '%dpx' % y,
+ )
+ shape = find_first(el, 'shape')
+ if shape is not None and shape.get('style') is not None:
+ # If either of these are not set, rely on the method `image` to not
+ # use either of them.
+ x = 0
+ y = 0
+ styles = shape.get('style').split(';')
+
+ for s in styles:
+ if s.startswith('height:'):
+ y = s.split(':')[1]
+ if s.startswith('width:'):
+ x = s.split(':')[1]
+ return x, y
+ return 0, 0
+
+ def parse_image(self, el):
+ x, y = self._get_image_size(el)
+ rId = self._get_image_id(el)
+ src = self.rels_dict.get(rId)
+ if not src:
+ return ''
+ src = os.path.join(
+ 'word',
+ src,
+ )
+ if src in self._image_data:
+ filename = os.path.split(src)[-1]
+ return self.image(self._image_data[src], filename, x, y)
+ return ''
+
+ def _is_style_on(self, value):
+ """
+ For b, i, u (bold, italics, and underline) merely having the tag is not
+ sufficient. You need to check to make sure it is not set to "false" as
+ well.
+ """
+ return value not in DISABLED_STYLE_VALUES
+
+ def parse_t(self, el, parsed):
+ if el.text is None:
+ return ''
+ return self.escape(el.text)
+
+ def parse_tab(self, el, parsed):
+ return self.tab()
+
+ def parse_hyphen(self, el, parsed):
+ return '-'
+
+ def parse_break_tag(self, el, parsed):
+ return self.break_tag()
+
+ def parse_deletion(self, el, parsed):
+ if el.text is None:
return ''
+ return self.deletion(el.text, '', '')
- def get_list_style(self, numval):
- ids = self.numbering_root.findall_all('num')
- for _id in ids:
- if _id.attrib['numId'] == numval:
- abstractid = _id.find('abstractNumId')
- abstractid = abstractid.attrib['val']
- style_information = self.numbering_root.findall_all(
- 'abstractNum',
- )
- for info in style_information:
- if info.attrib['abstractNumId'] == abstractid:
- for i in el_iter(info):
- if i.find('numFmt') is not None:
- return i.find('numFmt').attrib
-
- def get_comments(self, doc_id):
- if self.comment_store is None:
- # TODO throw appropriate error
- comment_root = ElementTree.fromstring(
- remove_namespaces(self.comment_text),
+ def parse_insertion(self, el, parsed):
+ return self.insertion(parsed, '', '')
+
+ def parse_r(self, el, parsed):
+ """
+ Parse the running text.
+ """
+ text = parsed
+ if not text:
+ return ''
+
+ run_properties = {}
+
+ # Get the rPr for the current style, they are the defaults.
+ p = find_ancestor_with_tag(self.pre_processor, el, 'p')
+ paragraph_style = self.memod_tree_op('find_first', p, 'pStyle')
+ if paragraph_style is not None:
+ style = paragraph_style.get('val')
+ style_defaults = self.styles_dict.get(style, {})
+ run_properties.update(
+ style_defaults.get('default_run_properties', {}),
)
- ids_and_info = {}
- ids = comment_root.findall_all('comment')
- for _id in ids:
- ids_and_info[_id.attrib['id']] = {
- "author": _id.attrib['author'],
- "date": _id.attrib['date'],
- "text": _id.findall_all('t')[0].text,
- }
- self.comment_store = ids_and_info
- return self.comment_store[doc_id]
+
+ # Get the rPr for the current r tag, they are overrides.
+ run_properties_element = el.find('rPr')
+ if run_properties_element:
+ local_run_properties = self._parse_run_properties(
+ run_properties_element,
+ )
+ run_properties.update(local_run_properties)
+
+ inline_tag_handlers = {
+ 'b': self.bold,
+ 'i': self.italics,
+ 'u': self.underline,
+ 'caps': self.caps,
+ 'smallCaps': self.small_caps,
+ 'strike': self.strike,
+ 'dstrike': self.strike,
+ 'vanish': self.hide,
+ 'webHidden': self.hide,
+ }
+ styles_needing_application = []
+ for property_name, property_value in run_properties.items():
+ # These tags are a little different, handle them separately
+ # from the rest.
+ # This could be a superscript or a subscript
+ if property_name == 'vertAlign':
+ if property_value == 'superscript':
+ styles_needing_application.append(self.superscript)
+ elif property_value == 'subscript':
+ styles_needing_application.append(self.subscript)
+ else:
+ if (
+ property_name in inline_tag_handlers and
+ self._is_style_on(property_value)
+ ):
+ styles_needing_application.append(
+ inline_tag_handlers[property_name],
+ )
+
+ # Apply all the handlers.
+ for func in styles_needing_application:
+ text = func(text)
+
+ return text
@property
def parsed(self):
@@ -335,10 +727,26 @@ def linebreak(self):
def paragraph(self, text):
return text
+ @abstractmethod
+ def heading(self, text, heading_level):
+ return text
+
@abstractmethod
def insertion(self, text, author, date):
return text
+ @abstractmethod
+ def hyperlink(self, text, href):
+ return text
+
+ @abstractmethod
+ def image_handler(self, path):
+ return path
+
+ @abstractmethod
+ def image(self, data, filename, x, y):
+ return self.image_handler(data)
+
@abstractmethod
def deletion(self, text, author, date):
return text
@@ -355,6 +763,30 @@ def italics(self, text):
def underline(self, text):
return text
+ @abstractmethod
+ def caps(self, text):
+ return text
+
+ @abstractmethod
+ def small_caps(self, text):
+ return text
+
+ @abstractmethod
+ def strike(self, text):
+ return text
+
+ @abstractmethod
+ def hide(self, text):
+ return text
+
+ @abstractmethod
+ def superscript(self, text):
+ return text
+
+ @abstractmethod
+ def subscript(self, text):
+ return text
+
@abstractmethod
def tab(self):
return True
@@ -388,15 +820,5 @@ def page_break(self):
return True
@abstractmethod
- def right_justify(self, text):
- return text
-
- @abstractmethod
- def center_justify(self, text):
- return text
-
- @abstractmethod
- def indent(self, text, left=None, right=None, firstLine=None):
- return text
-
- #TODO JUSTIFIED JUSTIFIED TEXT
+ def indent(self, text, left='', right='', firstLine=''):
+ return text # TODO JUSTIFIED JUSTIFIED TEXT
diff --git a/pydocx/__init__.py b/pydocx/__init__.py
index 9b42e00f..e59babb6 100644
--- a/pydocx/__init__.py
+++ b/pydocx/__init__.py
@@ -1,8 +1,34 @@
-from .parsers import *
+import sys
+from .parsers import Docx2Html, Docx2Markdown
+
def docx2html(path):
return Docx2Html(path).parsed
+
def docx2markdown(path):
return Docx2Markdown(path).parsed
+VERSION = '0.3.13'
+
+
+def main():
+ try:
+ parser_to_use = sys.argv[1]
+ path_to_docx = sys.argv[2]
+ path_to_html = sys.argv[3]
+ except IndexError:
+ print 'Must specify which parser as well as the file to convert and the name of the resulting file.' # noqa
+ sys.exit()
+ if parser_to_use == '--html':
+ html = Docx2Html(path_to_docx).parsed
+ elif parser_to_use == '--markdown':
+ html = Docx2Markdown(path_to_docx).parsed
+ else:
+ print 'Only valid parsers are --html and --markdown'
+ sys.exit()
+ with open(path_to_html, 'w') as f:
+ f.write(html.encode('utf-8'))
+
+if __name__ == '__main__':
+ main()
diff --git a/pydocx/exceptions.py b/pydocx/exceptions.py
new file mode 100644
index 00000000..cdff556a
--- /dev/null
+++ b/pydocx/exceptions.py
@@ -0,0 +1,2 @@
+class MalformedDocxException(Exception):
+ pass
diff --git a/pydocx/fixtures/all_configured_styles.docx b/pydocx/fixtures/all_configured_styles.docx
new file mode 100644
index 00000000..8f514372
Binary files /dev/null and b/pydocx/fixtures/all_configured_styles.docx differ
diff --git a/pydocx/fixtures/attachment_is_tiff.docx b/pydocx/fixtures/attachment_is_tiff.docx
new file mode 100644
index 00000000..774362ca
Binary files /dev/null and b/pydocx/fixtures/attachment_is_tiff.docx differ
diff --git a/pydocx/fixtures/bigger_font_size_to_header.docx b/pydocx/fixtures/bigger_font_size_to_header.docx
new file mode 100644
index 00000000..c722888b
Binary files /dev/null and b/pydocx/fixtures/bigger_font_size_to_header.docx differ
diff --git a/pydocx/fixtures/convert_p_to_h.docx b/pydocx/fixtures/convert_p_to_h.docx
new file mode 100644
index 00000000..53769e15
Binary files /dev/null and b/pydocx/fixtures/convert_p_to_h.docx differ
diff --git a/pydocx/fixtures/fake_headings_by_length.docx b/pydocx/fixtures/fake_headings_by_length.docx
new file mode 100644
index 00000000..a130f5ba
Binary files /dev/null and b/pydocx/fixtures/fake_headings_by_length.docx differ
diff --git a/pydocx/fixtures/greek_alphabet.docx b/pydocx/fixtures/greek_alphabet.docx
new file mode 100644
index 00000000..46ab5429
Binary files /dev/null and b/pydocx/fixtures/greek_alphabet.docx differ
diff --git a/pydocx/fixtures/has_image.docx b/pydocx/fixtures/has_image.docx
new file mode 100644
index 00000000..2ebd0bd0
Binary files /dev/null and b/pydocx/fixtures/has_image.docx differ
diff --git a/pydocx/fixtures/has_missing_image.docx b/pydocx/fixtures/has_missing_image.docx
new file mode 100644
index 00000000..996e6671
Binary files /dev/null and b/pydocx/fixtures/has_missing_image.docx differ
diff --git a/pydocx/fixtures/has_title.docx b/pydocx/fixtures/has_title.docx
new file mode 100644
index 00000000..a87d88ed
Binary files /dev/null and b/pydocx/fixtures/has_title.docx differ
diff --git a/pydocx/fixtures/header_footer_problem.docx b/pydocx/fixtures/header_footer_problem.docx
new file mode 100644
index 00000000..6bc49a7a
Binary files /dev/null and b/pydocx/fixtures/header_footer_problem.docx differ
diff --git a/pydocx/fixtures/headers.docx b/pydocx/fixtures/headers.docx
new file mode 100644
index 00000000..890104c7
Binary files /dev/null and b/pydocx/fixtures/headers.docx differ
diff --git a/pydocx/fixtures/headers_with_full_line_styles.docx b/pydocx/fixtures/headers_with_full_line_styles.docx
new file mode 100644
index 00000000..38d6f6a8
Binary files /dev/null and b/pydocx/fixtures/headers_with_full_line_styles.docx differ
diff --git a/pydocx/fixtures/include_tabs.docx b/pydocx/fixtures/include_tabs.docx
new file mode 100644
index 00000000..f7f53e92
Binary files /dev/null and b/pydocx/fixtures/include_tabs.docx differ
diff --git a/pydocx/fixtures/inline_tags.docx b/pydocx/fixtures/inline_tags.docx
new file mode 100644
index 00000000..4aba2347
Binary files /dev/null and b/pydocx/fixtures/inline_tags.docx differ
diff --git a/pydocx/fixtures/justification.docx b/pydocx/fixtures/justification.docx
new file mode 100644
index 00000000..7f8a3bf1
Binary files /dev/null and b/pydocx/fixtures/justification.docx differ
diff --git a/pydocx/fixtures/list_in_table.docx b/pydocx/fixtures/list_in_table.docx
new file mode 100644
index 00000000..d1a87388
Binary files /dev/null and b/pydocx/fixtures/list_in_table.docx differ
diff --git a/pydocx/fixtures/list_to_header.docx b/pydocx/fixtures/list_to_header.docx
new file mode 100644
index 00000000..f9b3946e
Binary files /dev/null and b/pydocx/fixtures/list_to_header.docx differ
diff --git a/pydocx/fixtures/lists_with_styles.docx b/pydocx/fixtures/lists_with_styles.docx
new file mode 100644
index 00000000..c1c7ecf8
Binary files /dev/null and b/pydocx/fixtures/lists_with_styles.docx differ
diff --git a/pydocx/fixtures/localDpi.docx b/pydocx/fixtures/localDpi.docx
new file mode 100644
index 00000000..0f6d7f77
Binary files /dev/null and b/pydocx/fixtures/localDpi.docx differ
diff --git a/pydocx/fixtures/missing_content.docx b/pydocx/fixtures/missing_content.docx
new file mode 100644
index 00000000..21bed964
Binary files /dev/null and b/pydocx/fixtures/missing_content.docx differ
diff --git a/pydocx/fixtures/missing_numbering.docx b/pydocx/fixtures/missing_numbering.docx
new file mode 100644
index 00000000..5034f524
Binary files /dev/null and b/pydocx/fixtures/missing_numbering.docx differ
diff --git a/pydocx/fixtures/missing_style.docx b/pydocx/fixtures/missing_style.docx
new file mode 100644
index 00000000..3ded985c
Binary files /dev/null and b/pydocx/fixtures/missing_style.docx differ
diff --git a/pydocx/fixtures/nested_lists.docx b/pydocx/fixtures/nested_lists.docx
new file mode 100644
index 00000000..0f9cecbd
Binary files /dev/null and b/pydocx/fixtures/nested_lists.docx differ
diff --git a/pydocx/fixtures/nested_table_rowspan.docx b/pydocx/fixtures/nested_table_rowspan.docx
new file mode 100644
index 00000000..b43b8a0d
Binary files /dev/null and b/pydocx/fixtures/nested_table_rowspan.docx differ
diff --git a/pydocx/fixtures/nested_tables.docx b/pydocx/fixtures/nested_tables.docx
new file mode 100644
index 00000000..af704d4d
Binary files /dev/null and b/pydocx/fixtures/nested_tables.docx differ
diff --git a/pydocx/fixtures/no_break_hyphen.docx b/pydocx/fixtures/no_break_hyphen.docx
new file mode 100644
index 00000000..64d68fa3
Binary files /dev/null and b/pydocx/fixtures/no_break_hyphen.docx differ
diff --git a/pydocx/fixtures/resized_image.docx b/pydocx/fixtures/resized_image.docx
new file mode 100644
index 00000000..913099c4
Binary files /dev/null and b/pydocx/fixtures/resized_image.docx differ
diff --git a/pydocx/fixtures/shift_enter.docx b/pydocx/fixtures/shift_enter.docx
new file mode 100644
index 00000000..4128c0a2
Binary files /dev/null and b/pydocx/fixtures/shift_enter.docx differ
diff --git a/pydocx/fixtures/simple.docx b/pydocx/fixtures/simple.docx
new file mode 100644
index 00000000..1d2a1c23
Binary files /dev/null and b/pydocx/fixtures/simple.docx differ
diff --git a/pydocx/fixtures/simple_lists.docx b/pydocx/fixtures/simple_lists.docx
new file mode 100644
index 00000000..c09ad744
Binary files /dev/null and b/pydocx/fixtures/simple_lists.docx differ
diff --git a/pydocx/fixtures/simple_table.docx b/pydocx/fixtures/simple_table.docx
new file mode 100644
index 00000000..26de483c
Binary files /dev/null and b/pydocx/fixtures/simple_table.docx differ
diff --git a/pydocx/fixtures/special_chars.docx b/pydocx/fixtures/special_chars.docx
new file mode 100644
index 00000000..b4b9287f
Binary files /dev/null and b/pydocx/fixtures/special_chars.docx differ
diff --git a/pydocx/fixtures/split_header.docx b/pydocx/fixtures/split_header.docx
new file mode 100644
index 00000000..cc4bd5cf
Binary files /dev/null and b/pydocx/fixtures/split_header.docx differ
diff --git a/pydocx/fixtures/styled_bolding.docx b/pydocx/fixtures/styled_bolding.docx
new file mode 100644
index 00000000..90c6b157
Binary files /dev/null and b/pydocx/fixtures/styled_bolding.docx differ
diff --git a/pydocx/fixtures/super_and_subscript.docx b/pydocx/fixtures/super_and_subscript.docx
new file mode 100644
index 00000000..06ea2d7a
Binary files /dev/null and b/pydocx/fixtures/super_and_subscript.docx differ
diff --git a/pydocx/fixtures/table_col_row_span.docx b/pydocx/fixtures/table_col_row_span.docx
new file mode 100644
index 00000000..856abfdf
Binary files /dev/null and b/pydocx/fixtures/table_col_row_span.docx differ
diff --git a/pydocx/fixtures/tables_in_lists.docx b/pydocx/fixtures/tables_in_lists.docx
new file mode 100644
index 00000000..11859541
Binary files /dev/null and b/pydocx/fixtures/tables_in_lists.docx differ
diff --git a/pydocx/fixtures/track_changes_on.docx b/pydocx/fixtures/track_changes_on.docx
new file mode 100644
index 00000000..dcb7ba1c
Binary files /dev/null and b/pydocx/fixtures/track_changes_on.docx differ
diff --git a/pydocx/fixtures/upper_alpha_all_bold.docx b/pydocx/fixtures/upper_alpha_all_bold.docx
new file mode 100644
index 00000000..d518b2c5
Binary files /dev/null and b/pydocx/fixtures/upper_alpha_all_bold.docx differ
diff --git a/pydocx/lxmlparser.py b/pydocx/lxmlparser.py
deleted file mode 100644
index 94b130d3..00000000
--- a/pydocx/lxmlparser.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import zipfile
-from lxml import etree
-from StringIO import StringIO
-__author__ = 'samportnow'
-
-#for el in tree.iter():
- # The way lists are handled could double visit certain elements; keep
- # track of which elements have been visited and skip any that have been
- # visited already.
- #if el in visited_nodes:
- #continue
-with zipfile.ZipFile('/Users/samportnow/Documents/pydocx/helloworld.docx') as f:
- document = f.read('word/document.xml')
- numbering= f.read('word/numbering.xml')
-parser=etree.XMLParser(ns_clean=True)
-document=StringIO(document)
-numbering=StringIO(numbering)
-numbering_tree=etree.parse(numbering,parser)
-numbering_namespace=numbering_tree.getroot().nsmap['w']
-visited_els=[]
-
-def get_parsed():
- parser=etree.XMLParser(ns_clean=True)
- tree=etree.parse(document,parser)
- namespace=tree.getroot().nsmap['w']
- #rpr is run properties for the paragraph mark
- paragraph=''
- run_text=''
- running_text=''
- for el in tree.iter():
- if el.tag=='{%s}p' %namespace:
- for wp in el.iter():
- if wp.tag =='{%s}ins' %namespace:
- for text in wp.iterchildren():
- if text not in visited_els:
- run_text +=''+get_text(text,namespace,visited_els)+'
'
- visited_els.append(text)
- if wp.tag=='{%s}r' %namespace and wp not in visited_els:
- run_text+=get_text(wp,namespace,visited_els)
- visited_els.append(wp)
- if not el.getchildren():
- run_text+='
'
- if wp.tag == '{%s}ilvl' %namespace:
- for lst in el.iter():
- if lst.find('{%s}numId' %namespace) is not None and el not in visited_els:
- numval = lst.find('{%s}numId' %namespace).attrib['{%s}val' %namespace]
- lst_type=get_list_style(numval)
- if get_text(lst,namespace,visited_els) and el not in visited_els and lst_type['{%s}val' %namespace] != 'bullet':
- if lst.getnext() is not None:
- if lst not in visited_els:
- while lst.getnext() is not None:
- if lst not in visited_els:
- text = get_text(lst,namespace,visited_els)
- next_txt = get_text(lst.getnext(),namespace,visited_els)
- running_text += text + next_txt
- visited_els.append(lst)
- visited_els.append(lst.getnext())
- lst=lst.getnext()
- else:
- run_text += '' + running_text + ''
- break
- else:
- run_text +='' + get_text(lst, namespace, visited_els) + ''
- visited_els.append(lst)
- print running_text
- return run_text
-
-
-def get_text(wp,namespace,visited_els):
- run_text= ''
- decorator = ''
- closing = ''
- if wp.find('{%s}tab' %namespace) is not None:
- run_text+='%nbsp'
- if wp.find('{%s}rPr' %namespace) is not None:
- for tag in wp.iter():
- if tag.find('{%s}u' %namespace) is not None:
- if wp.find('{%s}t' %namespace) is not None:
- decorator +=''
- closing += ''
- visited_els.append(wp.find('{%s}t' %namespace))
- if tag.find('{%s}i' %namespace) is not None:
- if wp.find('{%s}t' %namespace) is not None:
- decorator += ''
- closing += ''
- visited_els.append(wp.find('{%s}t' %namespace))
- if tag.find('{%s}b' %namespace) is not None:
- if wp.find('{%s}t' %namespace) is not None:
- decorator += ''
- closing += ''
- visited_els.append(wp.find('{%s}t' %namespace))
- run_text = wp.find('{%s}t' %namespace).text
- run_text = decorator + run_text + closing
- if wp.find('{%s}t' %namespace) is not None and wp.find('{%s}t' %namespace) not in visited_els:
- run_text+=wp.find('{%s}t' %namespace).text
- return run_text
-
-def get_list_style(numval):
- ids = numbering_tree.findall('{%s}num' %numbering_namespace)
- for id in ids:
- if id.attrib['{%s}numId' %numbering_namespace] == numval:
- abstractid=id.find('{%s}abstractNumId' %numbering_namespace)
- abstractid=abstractid.attrib['{%s}val' %numbering_namespace]
- style_information=numbering_tree.findall('{%s}abstractNum' %numbering_namespace)
- for info in style_information:
- if info.attrib['{%s}abstractNumId' %numbering_namespace] == abstractid:
- for i in info.iter():
- if i.find('{%s}numFmt' %numbering_namespace) is not None:
- return i.find('{%s}numFmt' %numbering_namespace).attrib
-
-print get_parsed()
diff --git a/pydocx/parsers/Docx2Html.py b/pydocx/parsers/Docx2Html.py
index bfaad2a6..e4067a10 100644
--- a/pydocx/parsers/Docx2Html.py
+++ b/pydocx/parsers/Docx2Html.py
@@ -1,21 +1,46 @@
-from pydocx.DocxParser import DocxParser
-
+import base64
import xml.sax.saxutils
+from pydocx.DocxParser import DocxParser
+
class Docx2Html(DocxParser):
@property
def parsed(self):
- self._parsed = self._parsed.replace('', '
')
- self._parsed = self._parsed.replace('
', '
')
- self._parsed = self._parsed.replace('
', '')
- return (
-            '<html>{content}</html>'
- ).format(content=self._parsed)
+ content = self._parsed
+        content = "<html>%(head)s<body>%(content)s</body></html>" % {
+ 'head': self.head(),
+ 'content': content,
+ }
+ return unicode(content)
+
+ def head(self):
+        return "<head>%(style)s</head>" % {
+ 'style': self.style(),
+ }
+
+ def style(self):
+ result = (
+ ''
+ ) % {
+ #multiple by (4/3) to get to px
+ 'width': (self.page_width * (4 / 3)),
+ }
+ return result
def escape(self, text):
return xml.sax.saxutils.quoteattr(text)[1:-1]
@@ -26,61 +51,155 @@ def linebreak(self, pre=None):
def paragraph(self, text, pre=None):
         return '<p>' + text + '</p>'
+ def heading(self, text, heading_value):
+        return '<%(tag)s>%(text)s</%(tag)s>' % {
+ 'tag': heading_value,
+ 'text': text,
+ }
+
def insertion(self, text, author, date):
return (
- "{text}"
- ).format(author=author, date=date, text=text)
+            "<span class='pydocx-insert' author='%(author)s' date='%(date)s'>%(text)s</span>"
+ ) % {
+ 'author': author,
+ 'date': date,
+ 'text': text,
+ }
+
+ def hyperlink(self, text, href):
+ if text == '':
+ return ''
+        return '<a href="%(href)s">%(text)s</a>' % {
+ 'href': href,
+ 'text': text,
+ }
+
+ def image_handler(self, image_data, filename):
+ extension = filename.split('.')[-1].lower()
+ b64_encoded_src = 'data:image/%s;base64,%s' % (
+ extension,
+ base64.b64encode(image_data),
+ )
+ b64_encoded_src = self.escape(b64_encoded_src)
+ return b64_encoded_src
+
+ def image(self, image_data, filename, x, y):
+ src = self.image_handler(image_data, filename)
+ if not src:
+ return ''
+ if all([x, y]):
+            return '<img src="%s" height="%s" width="%s" />' % (
+ src,
+ y,
+ x,
+ )
+ else:
+            return '<img src="%s" />' % src
def deletion(self, text, author, date):
return (
- "{text}"
- ).format(author=author, date=date, text=text)
+            "<span class='pydocx-delete' author='%(author)s' date='%(date)s'>%(text)s</span>"
+ ) % {
+ 'author': author,
+ 'date': date,
+ 'text': text,
+ }
def list_element(self, text):
-        return "<li>{text}</li>".format(text=text)
+        return "<li>%(text)s</li>" % {
+            'text': text,
+        }
- def ordered_list(self, text):
-        return "<ol>{text}</ol>".format(text=text)
+    def ordered_list(self, text, list_style):
+        return '<ol list-style-type="%(list_style)s">%(text)s</ol>' % {
+ 'text': text,
+ 'list_style': list_style,
+ }
def unordered_list(self, text):
-        return "<ul>{text}</ul>".format(text=text)
+        return "<ul>%(text)s</ul>" % {
+ 'text': text,
+ }
def bold(self, text):
-        return '<b>' + text + '</b>'
+        return '<strong>' + text + '</strong>'
def italics(self, text):
-        return '<i>' + text + '</i>'
+        return '<em>' + text + '</em>'
def underline(self, text):
-        return '<u>' + text + '</u>'
+        return '<span class="pydocx-underline">' + text + '</span>'
+
+ def caps(self, text):
+        return '<span class="pydocx-caps">' + text + '</span>'
+
+ def small_caps(self, text):
+        return '<span class="pydocx-small-caps">' + text + '</span>'
+
+ def strike(self, text):
+        return '<span class="pydocx-strike">' + text + '</span>'
+
+ def hide(self, text):
+        return '<span class="pydocx-hidden">' + text + '</span>'
+
+ def superscript(self, text):
+        return '<sup>%(text)s</sup>' % {
+ 'text': text,
+ }
+
+ def subscript(self, text):
+        return '<sub>%(text)s</sub>' % {
+ 'text': text,
+ }
def tab(self):
# Insert before the text right?? So got the text and just do an insert
# at the beginning!
- return '    '
+ return ' '
def table(self, text):
- return ''
+ return ''
def table_row(self, text):
return '' + text + '
'
- def table_cell(self, text):
- return '' + text + ' | '
+ def table_cell(self, text, col='', row=''):
+ slug = '%(text)s | '
+ return slug % {
+ 'colspan': col,
+ 'rowspan': row,
+ 'text': text,
+ }
def page_break(self):
- return '
'
-
- def center_justify(self, text):
- return "" + text + '
'
-
- def right_justify(self, text):
- return "" + text + '
'
-
- def indent(self, text, right, left, firstLine):
- return "{text}
".format(
- left=left,
- text=text,
- )
+ return '
'
+
+ def indent(self, text, just='', firstLine='', left='', right=''):
+ slug = '%(text)s
"
+ return slug % {
+ 'text': text,
+ 'just': just,
+ 'firstLine': firstLine,
+ 'left': left,
+ 'right': right,
+ }
+
+ def break_tag(self):
+ return '<br />'
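
The handler methods above are the extension surface of Docx2Html. As a minimal sketch (not part of this diff; class and path names are illustrative), image_handler can be overridden so images are written to disk rather than inlined as base64 data URIs:

    from pydocx.parsers.Docx2Html import Docx2Html

    class Docx2HtmlDiskImages(Docx2Html):  # illustrative name, not in this diff
        def image_handler(self, image_data, filename):
            # Write the raw image bytes to disk and return the path;
            # Docx2Html.image() places the returned value in the img src.
            with open(filename, 'wb') as f:
                f.write(image_data)
            return self.escape(filename)

    # html = Docx2HtmlDiskImages('example.docx').parsed  # placeholder path
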
diff --git a/pydocx/parsers/Docx2Markdown.py b/pydocx/parsers/Docx2Markdown.py
index 1bb43e16..d023df7a 100644
--- a/pydocx/parsers/Docx2Markdown.py
+++ b/pydocx/parsers/Docx2Markdown.py
@@ -1,5 +1,6 @@
from pydocx.DocxParser import DocxParser
+
class Docx2Markdown(DocxParser):
def escape(self, text):
return text
@@ -17,8 +18,9 @@ def bold(self, text):
return '**' + text + '**'
def italics(self, text):
- # TODO do we need a "pre" variable, so I can check for *italics**italics* and turn it into *italicsitatlics*?
+ # TODO do we need a "pre" variable, so I can check for
+ # *italics**italics* and turn it into *italicsitalics*?
return '*' + text + '*'
def underline(self, text):
- return '***' +text + '***'
\ No newline at end of file
+ return '***' + text + '***'
diff --git a/pydocx/parsers/__init__.py b/pydocx/parsers/__init__.py
index a9524657..7684ae65 100644
--- a/pydocx/parsers/__init__.py
+++ b/pydocx/parsers/__init__.py
@@ -1,2 +1,4 @@
-from .Docx2Html import *
-from .Docx2Markdown import *
\ No newline at end of file
+from pydocx.parsers.Docx2Html import Docx2Html
+from pydocx.parsers.Docx2Markdown import Docx2Markdown
+
+__all__ = (Docx2Html, Docx2Markdown)
diff --git a/pydocx/tests/__init__.py b/pydocx/tests/__init__.py
new file mode 100644
index 00000000..82341e05
--- /dev/null
+++ b/pydocx/tests/__init__.py
@@ -0,0 +1,196 @@
+#from unittest import TestCase
+import re
+from contextlib import contextmanager
+
+from pydocx.parsers.Docx2Html import Docx2Html
+from pydocx.utils import (
+ parse_xml_from_string,
+)
+from pydocx.tests.document_builder import DocxBuilder as DXB
+from unittest import TestCase
+
+STYLE = (
+ ''
+)
+
+BASE_HTML = '''
+
+
+ %s
+
+ %%s
+
+''' % STYLE
+
+
+def assert_html_equal(actual_html, expected_html):
+ assert collapse_html(
+ actual_html,
+ ) == collapse_html(
+ expected_html
+ ), actual_html
+
+
+def collapse_html(html):
+ """
+ Remove insignificant whitespace from the html.
+
+ >>> print collapse_html('''\\
+ ...     <h1>
+ ...         Heading
+ ...     </h1>
+ ... ''')
+ <h1>Heading</h1>
+ >>> print collapse_html('''\\
+ ...     <p>
+ ...         Paragraph with
+ ...         multiple lines.
+ ...     </p>
+ ... ''')
+ <p>Paragraph with multiple lines.</p>
+ """
+ def smart_space(match):
+ # Put a space in between lines, unless exactly one side of the line
+ # break butts up against a tag.
+ before = match.group(1)
+ after = match.group(2)
+ space = ' '
+ if before == '>' or after == '<':
+ space = ''
+ return before + space + after
+ # Replace newlines and their surrounding whitespace with a single space (or
+ # empty string)
+ html = re.sub(
+ r'(>?)\s*\n\s*()',
+ smart_space,
+ html,
+ )
+ return html.strip()
+
+
+class XMLDocx2Html(Docx2Html):
+ """
+ Create the object without passing in a path to the document, set them
+ manually.
+ """
+ def __init__(self, *args, **kwargs):
+ # Pass in nothing for the path
+ super(XMLDocx2Html, self).__init__(path=None, *args, **kwargs)
+
+ def _build_data(
+ self,
+ path,
+ document_xml=None,
+ rels_dict=None,
+ numbering_dict=None,
+ styles_dict=None,
+ *args, **kwargs):
+ self._test_rels_dict = rels_dict
+ if rels_dict:
+ for value in rels_dict.values():
+ self._image_data['word/%s' % value] = 'word/%s' % value
+ self.numbering_root = None
+ if numbering_dict is not None:
+ self.numbering_root = parse_xml_from_string(
+ DXB.numbering(numbering_dict),
+ )
+ self.numbering_dict = numbering_dict
+ # Intentionally not calling super
+ if document_xml is not None:
+ self.root = parse_xml_from_string(document_xml)
+ self.zip_path = ''
+
+ # This is the standard page width for a word document, and also the page
+ # width that we are looking for in the test.
+ self.page_width = 612
+
+ self.styles_dict = styles_dict
+
+ def _parse_rels_root(self, *args, **kwargs):
+ if self._test_rels_dict is None:
+ return {}
+ return self._test_rels_dict
+
+ def get_list_style(self, num_id, ilvl):
+ try:
+ return self.numbering_dict[num_id][ilvl]
+ except KeyError:
+ return 'decimal'
+
+ def _parse_styles(self):
+ if self.styles_dict is None:
+ return {}
+ return self.styles_dict
+
+
+DEFAULT_NUMBERING_DICT = {
+ '1': {
+ '0': 'decimal',
+ '1': 'decimal',
+ },
+ '2': {
+ '0': 'lowerLetter',
+ '1': 'lowerLetter',
+ },
+}
+
+
+class _TranslationTestCase(TestCase):
+ expected_output = None
+ relationship_dict = None
+ styles_dict = None
+ numbering_dict = DEFAULT_NUMBERING_DICT
+ run_expected_output = True
+ parser = XMLDocx2Html
+ use_base_html = True
+ convert_root_level_upper_roman = False
+
+ def get_xml(self):
+ raise NotImplementedError()
+
+ @contextmanager
+ def toggle_run_expected_output(self):
+ self.run_expected_output = not self.run_expected_output
+ yield
+ self.run_expected_output = not self.run_expected_output
+
+ def test_expected_output(self):
+ if self.expected_output is None:
+ raise NotImplementedError('expected_output is not defined')
+ if not self.run_expected_output:
+ return
+
+ # Create the xml
+ tree = self.get_xml()
+
+ # Verify the final output.
+ parser = self.parser
+
+ def image_handler(self, src, *args, **kwargs):
+ return src
+ parser.image_handler = image_handler
+ html = parser(
+ convert_root_level_upper_roman=self.convert_root_level_upper_roman,
+ document_xml=tree,
+ rels_dict=self.relationship_dict,
+ numbering_dict=self.numbering_dict,
+ styles_dict=self.styles_dict,
+ ).parsed
+
+ if self.use_base_html:
+ assert_html_equal(html, BASE_HTML % self.expected_output)
+ else:
+ assert_html_equal(html, self.expected_output)
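
For reference, a sketch (not part of this commit) of how the harness above is meant to be extended: subclass _TranslationTestCase, build the document XML with DocxBuilder, and declare the expected HTML fragment. The exact fragment used below (<p>/<strong>) is an assumption about the parser's output, not something taken from this diff:

    from pydocx.tests import _TranslationTestCase
    from pydocx.tests.document_builder import DocxBuilder as DXB

    class BoldRunTestCase(_TranslationTestCase):  # illustrative test case
        # test_expected_output wraps this fragment in BASE_HTML before comparing.
        expected_output = '<p><strong>AAA</strong></p>'  # assumed output HTML

        def get_xml(self):
            run = DXB.r_tag([DXB.t_tag('AAA')], rpr=DXB.rpr_tag({'b': None}))
            return DXB.xml(DXB.p_tag([run]))
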
diff --git a/pydocx/tests/document_builder.py b/pydocx/tests/document_builder.py
new file mode 100644
index 00000000..c28e1e02
--- /dev/null
+++ b/pydocx/tests/document_builder.py
@@ -0,0 +1,271 @@
+from jinja2 import Environment, PackageLoader
+from pydocx.DocxParser import EMUS_PER_PIXEL
+
+templates = {
+ 'delete': 'text_delete.xml',
+ 'drawing': 'drawing.xml',
+ 'hyperlink': 'hyperlink.xml',
+ 'insert': 'insert.xml',
+ 'linebreak': 'linebreak.xml',
+ 'main': 'base.xml',
+ 'numbering': 'numbering.xml',
+ 'p': 'p.xml',
+ 'pict': 'pict.xml',
+ 'r': 'r.xml',
+ 'rpr': 'rpr.xml',
+ 'sdt': 'sdt.xml',
+ 'sectPr': 'sectPr.xml',
+ 'smartTag': 'smart_tag.xml',
+ 'style': 'style.xml',
+ 'styles': 'styles.xml',
+ 't': 't.xml',
+ 'table': 'table.xml',
+ 'tc': 'tc.xml',
+ 'tr': 'tr.xml',
+}
+
+env = Environment(
+ loader=PackageLoader(
+ 'pydocx.tests',
+ 'templates',
+ ),
+)
+
+
+class DocxBuilder(object):
+
+ @classmethod
+ def xml(self, body):
+ template = env.get_template(templates['main'])
+ return template.render(body=body)
+
+ @classmethod
+ def p_tag(
+ self,
+ text,
+ style='style0',
+ jc=None,
+ ):
+ if isinstance(text, str):
+ # Create a single r tag based on the text
+ run_tag = DocxBuilder.r_tag(
+ [DocxBuilder.t_tag(text)],
+ )
+ run_tags = [run_tag]
+ elif isinstance(text, list):
+ run_tags = text
+ else:
+ run_tags = [self.r_tag([])]
+ template = env.get_template(templates['p'])
+
+ kwargs = {
+ 'run_tags': run_tags,
+ 'style': style,
+ 'jc': jc,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def linebreak(self):
+ template = env.get_template(templates['linebreak'])
+ kwargs = {}
+ return template.render(**kwargs)
+
+ @classmethod
+ def t_tag(self, text):
+ template = env.get_template(templates['t'])
+ kwargs = {
+ 'text': text,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def r_tag(
+ self,
+ elements,
+ rpr=None,
+ ):
+ template = env.get_template(templates['r'])
+ if rpr is None:
+ rpr = DocxBuilder.rpr_tag()
+ kwargs = {
+ 'elements': elements,
+ 'rpr': rpr,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def rpr_tag(self, inline_styles=None, *args, **kwargs):
+ if inline_styles is None:
+ inline_styles = {}
+ valid_styles = (
+ 'b',
+ 'i',
+ 'u',
+ 'caps',
+ 'smallCaps',
+ 'strike',
+ 'dstrike',
+ 'vanish',
+ 'webHidden',
+ 'vertAlign',
+ )
+ for key in inline_styles:
+ if key not in valid_styles:
+ raise AssertionError('%s is not a valid style' % key)
+ template = env.get_template(templates['rpr'])
+ kwargs = {
+ 'tags': inline_styles,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def hyperlink_tag(self, r_id, run_tags):
+ template = env.get_template(templates['hyperlink'])
+ kwargs = {
+ 'r_id': r_id,
+ 'run_tags': run_tags,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def insert_tag(self, run_tags):
+ template = env.get_template(templates['insert'])
+ kwargs = {
+ 'run_tags': run_tags,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def delete_tag(self, deleted_texts):
+ template = env.get_template(templates['delete'])
+ kwargs = {
+ 'deleted_texts': deleted_texts,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def smart_tag(self, run_tags):
+ template = env.get_template(templates['smartTag'])
+ kwargs = {
+ 'run_tags': run_tags,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def sdt_tag(self, p_tag):
+ template = env.get_template(templates['sdt'])
+ kwargs = {
+ 'p_tag': p_tag,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def li(self, text, ilvl, numId, bold=False):
+ if isinstance(text, str):
+ # Create a single r tag based on the text and the bold flag
+ run_tag = DocxBuilder.r_tag([DocxBuilder.t_tag(text)], bold)
+ run_tags = [run_tag]
+ elif isinstance(text, list):
+ run_tags = []
+ for run_text, run_bold in text:
+ run_tags.append(
+ DocxBuilder.r_tag(
+ [DocxBuilder.t_tag(run_text)],
+ run_bold,
+ ),
+ )
+ else:
+ raise AssertionError('text must be a string or a list')
+ template = env.get_template(templates['p'])
+
+ kwargs = {
+ 'run_tags': run_tags,
+ 'is_list': True,
+ 'ilvl': ilvl,
+ 'numId': numId,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def table_cell(self, paragraph, merge=False, merge_continue=False):
+ kwargs = {
+ 'paragraph': paragraph,
+ 'merge': merge,
+ 'merge_continue': merge_continue
+ }
+ template = env.get_template(templates['tc'])
+ return template.render(**kwargs)
+
+ @classmethod
+ def table_row(self, tcs):
+ template = env.get_template(templates['tr'])
+ return template.render(table_cells=tcs)
+
+ @classmethod
+ def table(self, trs):
+ template = env.get_template(templates['table'])
+ return template.render(table_rows=trs)
+
+ @classmethod
+ def drawing(self, r_id, height=None, width=None):
+ template = env.get_template(templates['drawing'])
+ if height is not None:
+ height = height * EMUS_PER_PIXEL
+ if width is not None:
+ width = width * EMUS_PER_PIXEL
+ kwargs = {
+ 'r_id': r_id,
+ 'height': height,
+ 'width': width,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def pict(self, r_id=None, height=None, width=None):
+ template = env.get_template(templates['pict'])
+ kwargs = {
+ 'r_id': r_id,
+ 'height': height,
+ 'width': width,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def sectPr_tag(self, p_tag):
+ template = env.get_template(templates['sectPr'])
+
+ kwargs = {
+ 'p_tag': p_tag,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def styles_xml(self, style_tags):
+ template = env.get_template(templates['styles'])
+
+ kwargs = {
+ 'style_tags': style_tags,
+ }
+ return template.render(**kwargs)
+
+ @classmethod
+ def style(self, style_id, value):
+ template = env.get_template(templates['style'])
+
+ kwargs = {
+ 'style_id': style_id,
+ 'value': value,
+ }
+
+ return template.render(**kwargs)
+
+ @classmethod
+ def numbering(self, numbering_dict):
+ template = env.get_template(templates['numbering'])
+
+ kwargs = {
+ 'numbering_dict': numbering_dict,
+ }
+
+ return template.render(**kwargs)
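
A short usage sketch (illustrative only, not part of the commit) tying the builder above to the XML-backed parser defined in pydocx/tests/__init__.py: build a two-item list body, wrap it in a document, and parse it.

    from pydocx.tests import XMLDocx2Html
    from pydocx.tests.document_builder import DocxBuilder as DXB

    # Two list items sharing numId 1 at indentation level 0.
    body = ''
    for text in ('AAA', 'BBB'):
        body += DXB.li(text=text, ilvl=0, numId=1)

    html = XMLDocx2Html(
        document_xml=DXB.xml(body),
        numbering_dict={'1': {'0': 'decimal'}},
    ).parsed
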
diff --git a/pydocx/tests/templates/base.xml b/pydocx/tests/templates/base.xml
new file mode 100644
index 00000000..60027500
--- /dev/null
+++ b/pydocx/tests/templates/base.xml
@@ -0,0 +1,4 @@
+
+
+ {{ body }}
+
diff --git a/pydocx/tests/templates/drawing.xml b/pydocx/tests/templates/drawing.xml
new file mode 100644
index 00000000..dfd470b4
--- /dev/null
+++ b/pydocx/tests/templates/drawing.xml
@@ -0,0 +1,65 @@
+
+
+
+
+
+
+
+
+
+
+ 2397125
+
+
+ 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pydocx/tests/templates/hyperlink.xml b/pydocx/tests/templates/hyperlink.xml
new file mode 100644
index 00000000..83645948
--- /dev/null
+++ b/pydocx/tests/templates/hyperlink.xml
@@ -0,0 +1,5 @@
+
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/insert.xml b/pydocx/tests/templates/insert.xml
new file mode 100644
index 00000000..afeb2691
--- /dev/null
+++ b/pydocx/tests/templates/insert.xml
@@ -0,0 +1,5 @@
+
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/linebreak.xml b/pydocx/tests/templates/linebreak.xml
new file mode 100644
index 00000000..ab92e811
--- /dev/null
+++ b/pydocx/tests/templates/linebreak.xml
@@ -0,0 +1 @@
+
diff --git a/pydocx/tests/templates/numbering.xml b/pydocx/tests/templates/numbering.xml
new file mode 100644
index 00000000..4eaac3cc
--- /dev/null
+++ b/pydocx/tests/templates/numbering.xml
@@ -0,0 +1,23 @@
+
+
+ {% for num_id, ilvl_data in numbering_dict.items() %}
+
+ {% for ilvl, format in ilvl_data.items() %}
+
+
+
+
+
+
+
+
+
+ {% endfor %}
+
+ {% endfor %}
+ {% for num_id in numbering_dict %}
+
+
+
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/p.xml b/pydocx/tests/templates/p.xml
new file mode 100644
index 00000000..7a78a060
--- /dev/null
+++ b/pydocx/tests/templates/p.xml
@@ -0,0 +1,19 @@
+
+
+
+ {% if is_list %}
+
+ {% if ilvl != None %}
+
+ {% endif %}
+ {% if numId != None %}
+
+ {% endif %}
+
+ {% endif %}
+ {% if jc %}{% endif %}
+
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/pict.xml b/pydocx/tests/templates/pict.xml
new file mode 100644
index 00000000..26f772a3
--- /dev/null
+++ b/pydocx/tests/templates/pict.xml
@@ -0,0 +1,17 @@
+
+
+
+
+
+
+
+
+
+
+
+
+ {% if r_id %}{% endif %}
+
+
+
+
diff --git a/pydocx/tests/templates/r.xml b/pydocx/tests/templates/r.xml
new file mode 100644
index 00000000..2f28a66b
--- /dev/null
+++ b/pydocx/tests/templates/r.xml
@@ -0,0 +1,6 @@
+
+ {{ rpr }}
+ {% for element in elements %}
+ {{ element }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/rpr.xml b/pydocx/tests/templates/rpr.xml
new file mode 100644
index 00000000..f49eb08b
--- /dev/null
+++ b/pydocx/tests/templates/rpr.xml
@@ -0,0 +1,5 @@
+
+ {% for tag, value in tags.items() %}
+
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/sdt.xml b/pydocx/tests/templates/sdt.xml
new file mode 100644
index 00000000..fe9a7e77
--- /dev/null
+++ b/pydocx/tests/templates/sdt.xml
@@ -0,0 +1,16 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {{ p_tag }}
+
+
diff --git a/pydocx/tests/templates/sectPr.xml b/pydocx/tests/templates/sectPr.xml
new file mode 100644
index 00000000..16a12050
--- /dev/null
+++ b/pydocx/tests/templates/sectPr.xml
@@ -0,0 +1,3 @@
+
+ {{ p_tag }}
+
diff --git a/pydocx/tests/templates/smart_tag.xml b/pydocx/tests/templates/smart_tag.xml
new file mode 100644
index 00000000..e45ee5b9
--- /dev/null
+++ b/pydocx/tests/templates/smart_tag.xml
@@ -0,0 +1,5 @@
+
+ {% for run_tag in run_tags %}
+ {{ run_tag }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/style.xml b/pydocx/tests/templates/style.xml
new file mode 100644
index 00000000..5fa9f00f
--- /dev/null
+++ b/pydocx/tests/templates/style.xml
@@ -0,0 +1,15 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/pydocx/tests/templates/styles.xml b/pydocx/tests/templates/styles.xml
new file mode 100644
index 00000000..a30e752e
--- /dev/null
+++ b/pydocx/tests/templates/styles.xml
@@ -0,0 +1,6 @@
+
+
+ {% for style in style_tags %}
+ {{ style }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/t.xml b/pydocx/tests/templates/t.xml
new file mode 100644
index 00000000..81d562b7
--- /dev/null
+++ b/pydocx/tests/templates/t.xml
@@ -0,0 +1,5 @@
+{% if text %}
+{{ text }}
+{% else %}
+
+{% endif %}
diff --git a/pydocx/tests/templates/table.xml b/pydocx/tests/templates/table.xml
new file mode 100644
index 00000000..e47783b6
--- /dev/null
+++ b/pydocx/tests/templates/table.xml
@@ -0,0 +1,18 @@
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% for table_row in table_rows %}
+ {{ table_row }}
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/tc.xml b/pydocx/tests/templates/tc.xml
new file mode 100644
index 00000000..eff9ce0d
--- /dev/null
+++ b/pydocx/tests/templates/tc.xml
@@ -0,0 +1,28 @@
+
+
+
+ {% if merge_continue %}
+
+
+ {% endif %}
+ {% if merge %}
+
+
+ {% endif %}
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% if paragraph %}
+ {{ paragraph }}
+ {% endif %}
+
diff --git a/pydocx/tests/templates/text_delete.xml b/pydocx/tests/templates/text_delete.xml
new file mode 100644
index 00000000..783b3ad3
--- /dev/null
+++ b/pydocx/tests/templates/text_delete.xml
@@ -0,0 +1,10 @@
+
+ {% for deleted_text in deleted_texts %}
+
+
+
+
+ {{ deleted_text }}
+
+ {% endfor %}
+
diff --git a/pydocx/tests/templates/tr.xml b/pydocx/tests/templates/tr.xml
new file mode 100644
index 00000000..6e2f6925
--- /dev/null
+++ b/pydocx/tests/templates/tr.xml
@@ -0,0 +1,8 @@
+
+
+
+
+ {% for table_cell in table_cells %}
+ {{ table_cell }}
+ {% endfor %}
+
diff --git a/pydocx/tests/test_docx.py b/pydocx/tests/test_docx.py
new file mode 100644
index 00000000..e9d77533
--- /dev/null
+++ b/pydocx/tests/test_docx.py
@@ -0,0 +1,849 @@
+import base64
+from os import path
+from tempfile import NamedTemporaryFile
+
+from nose.plugins.skip import SkipTest
+from nose.tools import raises
+
+from pydocx.tests import assert_html_equal, BASE_HTML
+from pydocx.parsers.Docx2Html import Docx2Html
+from pydocx.DocxParser import ZipFile
+from pydocx.exceptions import MalformedDocxException
+
+
+def convert(path, *args, **kwargs):
+ return Docx2Html(path, *args, **kwargs).parsed
+
+
+def test_extract_html():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'simple.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ Simple text
+
+
+ - one
+ - two
+ - three
+
+
+
+ | Cell1 |
+ Cell2 |
+
+
+ | Cell3 |
+ Cell4 |
+
+
+ ''')
+
+
+def test_nested_list():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'nested_lists.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ - one
+ - two
+ - three
+
+ - AAA
+ - BBB
+ - CCC
+
+ - alpha
+
+
+
+
+ - four
+
+
+ - xxx
+
+ - yyy
+
+
+
+
+ ''')
+
+
+def test_simple_list():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'simple_lists.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ - One
+
+
+ ''')
+
+
+def test_inline_tags():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'inline_tags.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % (
+ 'This sentence has some bold, '
+ 'some italics and some '
+ 'underline, '
+ 'as well as a hyperlink.
'
+ ))
+
+
+def test_all_configured_styles():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'all_configured_styles.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ aaa
+ bbb
+ ccc
+ ddd
+ eee
+ fff
+ ggg
+ hhh
+ iii
+ ''')
+
+
+def test_super_and_subscript():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'super_and_subscript.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAABBB
+ CCCDDD
+ ''')
+
+
+def test_unicode():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'greek_alphabet.docx',
+ )
+ actual_html = convert(file_path)
+ assert actual_html is not None
+ assert u'\u0391\u03b1' in actual_html
+
+
+def test_special_chars():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'special_chars.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ & < > link
''') # noqa
+
+
+def test_include_tabs():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'include_tabs.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(
+ actual_html,
+ BASE_HTML % 'AAA BBB
'
+ )
+
+
+def test_table_col_row_span():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'table_col_row_span.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+ | AAA |
+
+
+ | BBB |
+ CCC |
+
+
+ | DDD |
+
+
+ |
+ EEE
+ |
+ FFF |
+
+
+ |
+ GGG
+ |
+
+
+
+
+ | 1 |
+ 2 |
+ 3 |
+ 4 |
+
+
+ | 5 |
+ 6 |
+ 7 |
+
+
+ | 8 |
+ 9 |
+
+
+ | 10 |
+ 11 |
+ 12 |
+ 13 |
+
+
+ ''')
+
+
+def test_nested_table_rowspan():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'nested_table_rowspan.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+ | AAA |
+
+
+ | BBB |
+
+
+
+ | CCC |
+ DDD |
+
+
+ | EEE |
+
+
+ |
+
+
+ ''')
+
+
+def test_nested_tables():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'nested_tables.docx',
+ )
+ actual_html = convert(file_path)
+ # Find out why br tag is there.
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+ | AAA |
+ BBB |
+
+
+ | CCC |
+
+
+
+ | DDD |
+ EEE |
+
+
+ | FFF |
+ GGG |
+
+
+ |
+
+
+ ''')
+
+
+def test_list_in_table():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'list_in_table.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+
+
+ - AAA
+ - BBB
+ - CCC
+
+ |
+
+
+ ''')
+
+
+def test_tables_in_lists():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'tables_in_lists.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ - AAA
+ - BBB
+
+
+ | CCC |
+ DDD |
+
+
+ | EEE |
+ FFF |
+
+
+
+ - GGG
+
+ ''')
+
+
+def test_track_changes_on():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'track_changes_on.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ This was some content.
+ ''')
+
+
+def test_headers():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'headers.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ This is an H1
+ This is an H2
+ This is an H3
+ This is an H4
+ This is an H5
+ This is an H6
+ This is an H7
+ This is an H8
+ This is an H9
+ This is an H10
+ ''')
+
+
+def test_split_headers():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'split_header.docx',
+ )
+
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
BBB
CCC
+ ''')
+
+
+def get_image_data(docx_file_path, image_name):
+ """
+ Return base 64 encoded data for the image_name that is stored in the
+ docx_file_path.
+ """
+ with ZipFile(docx_file_path) as f:
+ images = [
+ e for e in f.infolist()
+ if e.filename == 'word/media/%s' % image_name
+ ]
+ if not images:
+ raise AssertionError('%s not in %s' % (image_name, docx_file_path))
+ data = f.read(images[0].filename)
+ return base64.b64encode(data)
+
+
+def test_has_image():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'has_image.docx',
+ )
+
+ actual_html = convert(file_path)
+ image_data = get_image_data(file_path, 'image1.gif')
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ AAA
+
+
+ ''' % image_data)
+
+
+def test_local_dpi():
+ # The image in this file does not have a set height or width; show that the
+ # html is still generated without them.
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'localDpi.docx',
+ )
+ actual_html = convert(file_path)
+ image_data = get_image_data(file_path, 'image1.jpeg')
+ assert_html_equal(actual_html, BASE_HTML % '''
+ 
+ ''' % image_data)
+
+
+def test_has_image_using_image_handler():
+ raise SkipTest('This needs to be converted to an xml test')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'has_image.docx',
+ )
+
+ def image_handler(*args, **kwargs):
+ return 'test'
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ ''')
+
+
+def test_headers_with_full_line_styles():
+ raise SkipTest('This test is not yet passing')
+ # Show that if a natural header is completely bold/italics that
+ # bold/italics will get stripped out.
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'headers_with_full_line_styles.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ BBB
+ CCC
+ ''')
+
+
+def test_convert_p_to_h():
+ raise SkipTest('This test is not yet passing')
+ # Show when it is correct to convert a p tag to an h tag based on
+ # bold/italics
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'convert_p_to_h.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ BBB
+ CCC
+
+ - DDD
+ - EEE
+ - FFF
+
+
+
+ | GGG |
+ HHH |
+
+
+ | III |
+ JJJ |
+
+
+ ''')
+
+
+def test_fake_headings_by_length():
+ raise SkipTest('This test is not yet passing')
+ # Show that converting p tags to h tags has a length limit. If the p tag is
+ # supposed to be converted to an h tag but has more than seven words in the
+ # paragraph, do not convert it.
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'fake_headings_by_length.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ Heading.
+ Still a heading.
+
+ This is not a heading because it is too many words.
+
+ ''')
+
+
+def test_shift_enter():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'shift_enter.docx',
+ )
+
+ # Test just the convert without clean_html to make sure the first
+ # break tag is present.
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
BBB
+ CCC
+
+ - DDD
EEE
+ - FFF
+
+
+
+ GGG HHH |
+ III JJJ |
+
+
+ | KKK |
+ LLL |
+
+
+ ''')
+
+
+def test_lists_with_styles():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'lists_with_styles.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+ - AAA
+ - BBB
+
+ - CCC
+ - DDD
+
+ - EEE
+
+ - FFF
+
+
+
+
+
+
+
+ ''')
+
+
+def test_list_to_header():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'list_to_header.docx',
+ )
+ actual_html = convert(file_path, convert_root_level_upper_roman=True)
+ # It should be noted that list item `GGG` is upper roman in the word
+ # document to show that only top level upper romans get converted.
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+
+ - BBB
+
+ CCC
+
+ - DDD
+
+ EEE
+
+ - FFF
+
+ - GGG
+
+
+
+ ''')
+
+
+def test_has_title():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'has_title.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ Title
+ Text
+ ''')
+
+
+def test_upper_alpha_all_bold():
+ raise SkipTest('This test is not yet passing')
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'upper_alpha_all_bold.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ BBB
+ CCC
+ ''')
+
+
+def test_simple_table():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'simple_table.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
+
+ Cell1
+ Cell3
+ |
+ Cell2
+ And I am writing in the table
+ |
+
+
+ | Cell4 |
+
+
+ ''')
+
+
+def test_justification():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'justification.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+
+
Center Justified
+
+
+
Right justified
+
+
+
+ Right justified and pushed in from right
+
+
+
+
+ Center justified and pushed in from left and it is
+ great and it is the coolest thing of all time and I like it and
+ I think it is cool
+
+
+
+
+ Left justified and pushed in from left
+
+
+ ''')
+
+
+def test_missing_style():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'missing_style.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ ''')
+
+
+def test_missing_numbering():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'missing_numbering.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ BBB
+ ''')
+
+
+def test_styled_bolding():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'styled_bolding.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA
+ BBB
+ ''')
+
+
+def test_no_break_hyphen():
+ file_path = path.join(
+ path.abspath(path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'no_break_hyphen.docx',
+ )
+ actual_html = convert(file_path)
+ assert_html_equal(actual_html, BASE_HTML % '''
+ AAA-BBB
+ ''')
+
+
+@raises(MalformedDocxException)
+def test_malformed_docx_exception():
+ with NamedTemporaryFile(suffix='.docx') as f:
+ convert(f.name)
+
+
+def _converter(*args, **kwargs):
+ # Having a converter that does nothing is the same as if abiword fails to
+ # convert.
+ pass
+
+
+#def test_converter_broken():
+# file_path = 'test.doc'
+# assert_raises(
+# ConversionFailed,
+# lambda: convert(file_path, converter=_converter),
+# )
+
+
+def test_fall_back():
+ raise SkipTest('This test is not yet passing')
+ file_path = 'test.doc'
+
+ def fall_back(*args, **kwargs):
+ return 'success'
+ html = convert(file_path, fall_back=fall_back, converter=_converter)
+ assert html == 'success'
+
+
+#@mock.patch('docx2html.core.read_html_file')
+#@mock.patch('docx2html.core.get_zip_file_handler')
+#def test_html_files(patch_zip_handler, patch_read):
+def test_html_files():
+ raise SkipTest('This test is not yet passing')
+
+ def raise_assertion(*args, **kwargs):
+ raise AssertionError('Should not have called get_zip_file_handler')
+ #patch_zip_handler.side_effect = raise_assertion
+
+ def return_text(*args, **kwargs):
+ return 'test'
+ #patch_read.side_effect = return_text
+
+ # Try with an html file
+ file_path = 'test.html'
+
+ html = convert(file_path)
+ assert html == 'test'
+
+ # Try again with an htm file.
+ file_path = 'test.htm'
+
+ html = convert(file_path)
+ assert html == 'test'
diff --git a/pydocx/tests/test_xml.py b/pydocx/tests/test_xml.py
new file mode 100644
index 00000000..904ed2b4
--- /dev/null
+++ b/pydocx/tests/test_xml.py
@@ -0,0 +1,1351 @@
+# -*- coding: utf-8 -*-
+import os
+import time
+
+from nose.plugins.skip import SkipTest
+
+from pydocx.tests.document_builder import DocxBuilder as DXB
+from pydocx.tests import (
+ XMLDocx2Html,
+ _TranslationTestCase,
+)
+from pydocx.utils import parse_xml_from_string, find_all
+
+
+class StyleIsOnTestCase(_TranslationTestCase):
+ expected_output = """
+ AAA
+ BBB
+ CCC
+ DDD
+ """
+
+ def get_xml(self):
+ tags = [
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('AAA')],
+ rpr=DXB.rpr_tag({'b': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('BBB')],
+ rpr=DXB.rpr_tag({'b': 'false'}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('CCC')],
+ rpr=DXB.rpr_tag({'b': '0'}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('DDD')],
+ rpr=DXB.rpr_tag({'u': 'none'}),
+ ),
+ ],
+ ),
+ ]
+
+ body = ''
+ for tag in tags:
+ body += tag
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkVanillaTestCase(_TranslationTestCase):
+
+ relationship_dict = {
+ 'rId0': 'www.google.com',
+ }
+
+ expected_output = '''
+ link.
+ '''
+
+ def get_xml(self):
+ run_tags = []
+ run_tags.append(DXB.r_tag([DXB.t_tag('link')]))
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ run_tags.append(DXB.r_tag([DXB.t_tag('.')]))
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkWithMultipleRunsTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'www.google.com',
+ }
+
+ expected_output = '''
+ link.
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'link']
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ run_tags.append(DXB.r_tag([DXB.t_tag('.')]))
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkNoTextTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'www.google.com',
+ }
+
+ expected_output = ''
+
+ def get_xml(self):
+ run_tags = []
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkNotInRelsDictTestCase(_TranslationTestCase):
+ relationship_dict = {
+ # 'rId0': 'www.google.com', missing
+ }
+
+ expected_output = 'link.
'
+
+ def get_xml(self):
+ run_tags = []
+ run_tags.append(DXB.r_tag([DXB.t_tag('link')]))
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ run_tags.append(DXB.r_tag([DXB.t_tag('.')]))
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class HyperlinkWithBreakTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'www.google.com',
+ }
+
+ expected_output = 'link
'
+
+ def get_xml(self):
+ run_tags = []
+ run_tags.append(DXB.r_tag([DXB.t_tag('link')]))
+ run_tags.append(DXB.r_tag([DXB.linebreak()]))
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class ImageLocal(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'media/image1.jpeg',
+ 'rId1': 'media/image2.jpeg',
+ }
+ expected_output = '''
+ 
+ 
+ '''
+
+ def get_xml(self):
+ drawing = DXB.drawing(height=None, width=None, r_id='rId0')
+ pict = DXB.pict(height=None, width=None, r_id='rId1')
+ tags = [
+ drawing,
+ pict,
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class ImageTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': 'media/image1.jpeg',
+ 'rId1': 'media/image2.jpeg',
+ }
+ expected_output = '''
+
+
+
+
+
+
+ '''
+
+ def get_xml(self):
+ drawing = DXB.drawing(height=20, width=40, r_id='rId0')
+ pict = DXB.pict(height=21, width=41, r_id='rId1')
+ tags = [
+ drawing,
+ pict,
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+ def test_get_image_id(self):
+ parser = XMLDocx2Html(
+ document_xml=self.get_xml(),
+ rels_dict=self.relationship_dict,
+ )
+ tree = parse_xml_from_string(self.get_xml())
+ els = []
+ els.extend(find_all(tree, 'drawing'))
+ els.extend(find_all(tree, 'pict'))
+ image_ids = []
+ for el in els:
+ image_ids.append(parser._get_image_id(el))
+ expected = [
+ 'rId0',
+ 'rId1',
+ ]
+ self.assertEqual(
+ set(image_ids),
+ set(expected),
+ )
+
+ def test_get_image_sizes(self):
+ parser = XMLDocx2Html(
+ document_xml=self.get_xml(),
+ rels_dict=self.relationship_dict,
+ )
+ tree = parse_xml_from_string(self.get_xml())
+ els = []
+ els.extend(find_all(tree, 'drawing'))
+ els.extend(find_all(tree, 'pict'))
+ image_ids = []
+ for el in els:
+ image_ids.append(parser._get_image_size(el))
+ expected = [
+ ('40px', '20px'),
+ ('41pt', '21pt'),
+ ]
+ self.assertEqual(
+ set(image_ids),
+ set(expected),
+ )
+
+
+class ImageNotInRelsDictTestCase(_TranslationTestCase):
+ relationship_dict = {
+ # 'rId0': 'media/image1.jpeg',
+ }
+ expected_output = ''
+
+ def get_xml(self):
+ drawing = DXB.drawing(height=20, width=40, r_id='rId0')
+ body = drawing
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class ImageNoSizeTestCase(_TranslationTestCase):
+ relationship_dict = {
+ 'rId0': os.path.join(
+ os.path.abspath(os.path.dirname(__file__)),
+ '..',
+ 'fixtures',
+ 'bullet_go_gray.png',
+ )
+ }
+ image_sizes = {
+ 'rId0': (0, 0),
+ }
+ expected_output = '''
+
+
+
+
+
+ ''' % relationship_dict['rId0']
+
+ @staticmethod
+ def image_handler(image_id, relationship_dict):
+ return relationship_dict.get(image_id)
+
+ def get_xml(self):
+ raise SkipTest(
+ 'Since we are not using PIL, we do not need this test yet.',
+ )
+ drawing = DXB.drawing('rId0')
+ tags = [
+ drawing,
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class TableTag(_TranslationTestCase):
+ expected_output = '''
+
+
+ | AAA |
+ BBB |
+
+
+ | CCC |
+ DDD |
+
+
+ '''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class RowSpanTestCase(_TranslationTestCase):
+
+ expected_output = '''
+
+
+ | AAA |
+ BBB |
+
+
+ | CCC |
+
+
+ '''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(
+ paragraph=DXB.p_tag('AAA'), merge=True, merge_continue=False)
+ cell2 = DXB.table_cell(
+ paragraph=DXB.p_tag(None), merge=False, merge_continue=True)
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class NestedTableTag(_TranslationTestCase):
+ expected_output = '''
+
+
+ | AAA |
+ BBB |
+
+
+ | CCC |
+
+
+
+ | DDD |
+ EEE |
+
+
+ | FFF |
+ GGG |
+
+
+ |
+
+
+ '''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ nested_table = DXB.table(rows)
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ cell4 = DXB.table_cell(nested_table)
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class TableWithInvalidTag(_TranslationTestCase):
+ expected_output = '''
+
+
+ | AAA |
+ BBB |
+
+
+ |
+ DDD |
+
+
+ '''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA'))
+ cell2 = DXB.table_cell('CCC')
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class TableWithListAndParagraph(_TranslationTestCase):
+ expected_output = '''
+
+
+
+
+ - AAA
+ - BBB
+
+ CCC
+ DDD
+ |
+
+
+ '''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ('BBB', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+ els = [
+ lis,
+ DXB.p_tag('CCC'),
+ DXB.p_tag('DDD'),
+ ]
+ td = ''
+ for el in els:
+ td += el
+ cell1 = DXB.table_cell(td)
+ row = DXB.table_row([cell1])
+ table = DXB.table([row])
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class SimpleListTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+ - BBB
+ - CCC
+
+ '''
+
+ # Ensure it's not failing somewhere and falling back to decimal
+ numbering_dict = {
+ '1': {
+ '0': 'lowerLetter',
+ }
+ }
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ('BBB', 0, 1),
+ ('CCC', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(lis)
+ return xml
+
+
+class SingleListItemTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+ '''
+
+ # Ensure it's not failing somewhere and falling back to decimal
+ numbering_dict = {
+ '1': {
+ '0': 'lowerLetter',
+ }
+ }
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(lis)
+ return xml
+
+
+class ListWithContinuationTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
BBB
+ - CCC
+
+
+ | DDD |
+ EEE |
+
+
+ | FFF |
+ GGG |
+
+
+
+ - HHH
+
+ '''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG'))
+ rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])]
+ table = DXB.table(rows)
+ tags = [
+ DXB.li(text='AAA', ilvl=0, numId=1),
+ DXB.p_tag('BBB'),
+ DXB.li(text='CCC', ilvl=0, numId=1),
+ table,
+ DXB.li(text='HHH', ilvl=0, numId=1),
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class ListWithMultipleContinuationTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+
+
+ - DDD
+
+ '''
+
+ def get_xml(self):
+ cell = DXB.table_cell(paragraph=DXB.p_tag('BBB'))
+ row = DXB.table_row([cell])
+ table1 = DXB.table([row])
+ cell = DXB.table_cell(paragraph=DXB.p_tag('CCC'))
+ row = DXB.table_row([cell])
+ table2 = DXB.table([row])
+ tags = [
+ DXB.li(text='AAA', ilvl=0, numId=1),
+ table1,
+ table2,
+ DXB.li(text='DDD', ilvl=0, numId=1),
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class MangledIlvlTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+
+ - BBB
+
+ - CCC
+
+
+
+ '''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 2),
+ ('BBB', 1, 1),
+ ('CCC', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(lis)
+ return xml
+
+
+class SeperateListsTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+
+ - BBB
+
+
+ - CCC
+
+ '''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 2),
+ # Because AAA and CCC are part of the same list (same list id)
+ # and BBB is different, these need to be split into three
+ # lists (or lose everything from BBB and after).
+ ('BBB', 0, 1),
+ ('CCC', 0, 2),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(lis)
+ return xml
+
+
+class InvalidIlvlOrderTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+ - BBB
+
+ - CCC
+
+
+
+
+
+ '''
+
+ def get_xml(self):
+ tags = [
+ DXB.li(text='AAA', ilvl=1, numId=1),
+ DXB.li(text='BBB', ilvl=3, numId=1),
+ DXB.li(text='CCC', ilvl=2, numId=1),
+ ]
+ body = ''
+ for el in tags:
+ body += el
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class DeeplyNestedTableTestCase(_TranslationTestCase):
+ expected_output = ''
+ run_expected_output = False
+
+ def get_xml(self):
+ paragraph = DXB.p_tag('AAA')
+
+ for _ in range(1000):
+ cell = DXB.table_cell(paragraph)
+ row = DXB.table_row([cell])
+ table = DXB.table([row])
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+ def test_performance(self):
+ with self.toggle_run_expected_output():
+ start_time = time.time()
+ try:
+ self.test_expected_output()
+ except AssertionError:
+ pass
+ end_time = time.time()
+ total_time = end_time - start_time
+ # This finishes in under a second on python 2.7
+ assert total_time < 3, total_time
+
+
+class LargeCellTestCase(_TranslationTestCase):
+ expected_output = ''
+ run_expected_output = False
+
+ def get_xml(self):
+ # Make sure it is over 1000 (which is the recursion limit)
+ paragraphs = [DXB.p_tag('%d' % i) for i in range(1000)]
+ cell = DXB.table_cell(paragraphs)
+ row = DXB.table_row([cell])
+ table = DXB.table([row])
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+ def test_performance(self):
+ with self.toggle_run_expected_output():
+ start_time = time.time()
+ try:
+ self.test_expected_output()
+ except AssertionError:
+ pass
+ end_time = time.time()
+ total_time = end_time - start_time
+ # This finishes in under a second on python 2.7
+ assert total_time < 3, total_time
+
+
+class NonStandardTextTagsTestCase(_TranslationTestCase):
+ expected_output = '''
+ insert
+ smarttag
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'insert ']
+ insert_tag = DXB.insert_tag(run_tags)
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'smarttag']
+ smart_tag = DXB.smart_tag(run_tags)
+
+ run_tags = [insert_tag, smart_tag]
+ body = DXB.p_tag(run_tags)
+ xml = DXB.xml(body)
+ return xml
+
+
+class RTagWithNoText(_TranslationTestCase):
+ expected_output = ''
+
+ def get_xml(self):
+ p_tag = DXB.p_tag(None) # No text
+ run_tags = [p_tag]
+ # The bug is only present in a hyperlink
+ run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)]
+ body = DXB.p_tag(run_tags)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class DeleteTagInList(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+ BBB
+
+ - CCC
+
+ '''
+
+ def get_xml(self):
+ delete_tags = DXB.delete_tag(['BBB'])
+ p_tag = DXB.p_tag([delete_tags])
+
+ body = DXB.li(text='AAA', ilvl=0, numId=0)
+ body += p_tag
+ body += DXB.li(text='CCC', ilvl=0, numId=0)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class InsertTagInList(_TranslationTestCase):
+ expected_output = '''
+
+ - AAABBB
+
+ - CCC
+
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'BBB']
+ insert_tags = DXB.insert_tag(run_tags)
+ p_tag = DXB.p_tag([insert_tags])
+
+ body = DXB.li(text='AAA', ilvl=0, numId=0)
+ body += p_tag
+ body += DXB.li(text='CCC', ilvl=0, numId=0)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class SmartTagInList(_TranslationTestCase):
+ expected_output = '''
+
+ - AAABBB
+
+ - CCC
+
+ '''
+
+ def get_xml(self):
+ run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'BBB']
+ smart_tag = DXB.smart_tag(run_tags)
+ p_tag = DXB.p_tag([smart_tag])
+
+ body = DXB.li(text='AAA', ilvl=0, numId=0)
+ body += p_tag
+ body += DXB.li(text='CCC', ilvl=0, numId=0)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class SingleListItem(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+ BBB
+ '''
+
+ numbering_dict = {
+ '1': {
+ '0': 'lowerLetter',
+ }
+ }
+
+ def get_xml(self):
+ li = DXB.li(text='AAA', ilvl=0, numId=1)
+ p_tags = [
+ DXB.p_tag('BBB'),
+ ]
+ body = li
+ for p_tag in p_tags:
+ body += p_tag
+ xml = DXB.xml(body)
+ return xml
+
+
+class SimpleTableTest(_TranslationTestCase):
+ expected_output = '''
+
+
+ | Blank |
+ Column 1 |
+ Column 2 |
+
+
+ | Row 1 |
+ First |
+ Second |
+
+
+ | Row 2 |
+ Third |
+ Fourth |
+
+
'''
+
+ def get_xml(self):
+ cell1 = DXB.table_cell(paragraph=DXB.p_tag('Blank'))
+ cell2 = DXB.table_cell(paragraph=DXB.p_tag('Row 1'))
+ cell3 = DXB.table_cell(paragraph=DXB.p_tag('Row 2'))
+ cell4 = DXB.table_cell(paragraph=DXB.p_tag('Column 1'))
+ cell5 = DXB.table_cell(paragraph=DXB.p_tag('First'))
+ cell6 = DXB.table_cell(paragraph=DXB.p_tag('Third'))
+ cell7 = DXB.table_cell(paragraph=DXB.p_tag('Column 2'))
+ cell8 = DXB.table_cell(paragraph=DXB.p_tag('Second'))
+ cell9 = DXB.table_cell(paragraph=DXB.p_tag('Fourth'))
+ rows = [DXB.table_row([cell1, cell4, cell7]),
+ DXB.table_row([cell2, cell5, cell8]),
+ DXB.table_row([cell3, cell6, cell9])]
+ table = DXB.table(rows)
+ body = table
+ xml = DXB.xml(body)
+ return xml
+
+
+class MissingIlvl(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+ BBB
+
+ - CCC
+
+ '''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ('BBB', None, 1), # Because why not.
+ ('CCC', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+ body = lis
+ xml = DXB.xml(body)
+ return xml
+
+
+class SameNumIdInTable(_TranslationTestCase):
+ expected_output = '''
+
+ - AAA
+
+
+ - CCC
+
+ '''
+
+ # Ensure it's not failing somewhere and falling back to decimal
+ numbering_dict = {
+ '1': {
+ '0': 'lowerLetter',
+ }
+ }
+
+ def get_xml(self):
+ li_text = [
+ ('BBB', 0, 1),
+ ]
+ lis = ''
+ for text, ilvl, numId in li_text:
+ lis += DXB.li(text=text, ilvl=ilvl, numId=numId)
+ cell1 = DXB.table_cell(lis)
+ rows = DXB.table_row([cell1])
+ table = DXB.table([rows])
+ lis = ''
+ lis += DXB.li(text='AAA', ilvl=0, numId=1)
+ lis += table
+ lis += DXB.li(text='CCC', ilvl=0, numId=1)
+ body = lis
+ xml = DXB.xml(body)
+ return xml
+
+
+class SDTTestCase(_TranslationTestCase):
+ expected_output = '''
+
+ - AAABBB
+
+ - CCC
+
+ '''
+
+ def get_xml(self):
+ body = ''
+ body += DXB.li(text='AAA', ilvl=0, numId=0)
+ body += DXB.sdt_tag(p_tag=DXB.p_tag(text='BBB'))
+ body += DXB.li(text='CCC', ilvl=0, numId=0)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class HeadingTestCase(_TranslationTestCase):
+ expected_output = '''
+ AAA
+ BBB
+ CCC
+ DDD
+ EEE
+ GGG
+ HHH
+ '''
+
+ styles_dict = {
+ 'style0': {
+ 'style_name': 'heading 1',
+ },
+ 'style1': {
+ 'style_name': 'heading 2',
+ },
+ 'style2': {
+ 'style_name': 'heading 3',
+ },
+ 'style3': {
+ 'style_name': 'heading 4',
+ },
+ 'style4': {
+ 'style_name': 'heading 5',
+ },
+ 'style5': {
+ 'style_name': 'heading 6',
+ },
+ }
+
+ def get_xml(self):
+ p_tags = [
+ DXB.p_tag(text='AAA', style='style0'),
+ DXB.p_tag(text='BBB', style='style1'),
+ DXB.p_tag(text='CCC', style='style2'),
+ DXB.p_tag(text='DDD', style='style3'),
+ DXB.p_tag(text='EEE', style='style4'),
+ DXB.p_tag(text='GGG', style='style5'),
+ DXB.p_tag(text='HHH', style='garbage'),
+ ]
+ body = ''
+ for tag in p_tags:
+ body += tag
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class StyledBoldingTestCase(_TranslationTestCase):
+ expected_output = '''
+ AAA
+ BBB
+ CCC
+ '''
+
+ styles_dict = {
+ 'style0': {
+ 'style_name': 'p1',
+ 'default_run_properties': {
+ 'b': '',
+ }
+ },
+ }
+
+ def get_xml(self):
+ p_tags = [
+ DXB.p_tag(text='AAA', style='style0'),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('BBB')],
+ # Don't do duplicates
+ rpr=DXB.rpr_tag({'b': None}),
+ ),
+ ],
+ style='style0',
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('CCC')],
+ # Overwrite the current style
+ rpr=DXB.rpr_tag({'b': 'false'}),
+ ),
+ ],
+ style='style0',
+ ),
+ ]
+ body = ''
+ for tag in p_tags:
+ body += tag
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class RomanNumeralToHeadingTestCase(_TranslationTestCase):
+ convert_root_level_upper_roman = True
+ numbering_dict = {
+ '1': {
+ '0': 'upperRoman',
+ '1': 'decimal',
+ '2': 'upperRoman',
+ },
+ '2': {
+ '0': 'upperRoman',
+ '1': 'decimal',
+ '2': 'upperRoman',
+ },
+ '3': {
+ '0': 'upperRoman',
+ '1': 'decimal',
+ '2': 'upperRoman',
+ },
+ }
+ expected_output = '''
+ AAA
+
+ - BBB
+
+ CCC
+
+ - DDD
+
+ EEE
+
+ - FFF
+
+ - GGG
+
+
+
+ '''
+
+ def get_xml(self):
+ li_text = [
+ ('AAA', 0, 1),
+ ('BBB', 1, 1),
+ ('CCC', 0, 2),
+ ('DDD', 1, 2),
+ ('EEE', 0, 3),
+ ('FFF', 1, 3),
+ ('GGG', 2, 3),
+ ]
+ body = ''
+ for text, ilvl, numId in li_text:
+ body += DXB.li(text=text, ilvl=ilvl, numId=numId)
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class MultipleTTagsInRTag(_TranslationTestCase):
+ expected_output = '''
+ ABC
+ '''
+
+ def get_xml(self):
+ r_tag = DXB.r_tag(
+ [DXB.t_tag(letter) for letter in 'ABC'],
+ )
+ p_tag = DXB.p_tag(
+ [r_tag],
+ jc='start',
+ )
+ body = p_tag
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class SuperAndSubScripts(_TranslationTestCase):
+ expected_output = '''
+ AAABBB
+ CCCDDD
+ '''
+
+ def get_xml(self):
+ p_tags = [
+ DXB.p_tag(
+ [
+ DXB.r_tag([DXB.t_tag('AAA')]),
+ DXB.r_tag(
+ [DXB.t_tag('BBB')],
+ rpr=DXB.rpr_tag({'vertAlign': 'superscript'}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('CCC')],
+ rpr=DXB.rpr_tag({'vertAlign': 'subscript'}),
+ ),
+ DXB.r_tag([DXB.t_tag('DDD')]),
+ ],
+ ),
+ ]
+ body = ''
+ for p_tag in p_tags:
+ body += p_tag
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class AvaliableInlineTags(_TranslationTestCase):
+ expected_output = '''
+ aaa
+ bbb
+ ccc
+ ddd
+ eee
+ fff
+ ggg
+ hhh
+ iii
+ jjj
+ '''
+
+ def get_xml(self):
+ p_tags = [
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('aaa')],
+ rpr=DXB.rpr_tag({'b': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('bbb')],
+ rpr=DXB.rpr_tag({'u': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('ccc')],
+ rpr=DXB.rpr_tag({'i': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('ddd')],
+ rpr=DXB.rpr_tag({'caps': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('eee')],
+ rpr=DXB.rpr_tag({'smallCaps': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('fff')],
+ rpr=DXB.rpr_tag({'strike': None})
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('ggg')],
+ rpr=DXB.rpr_tag({'dstrike': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('hhh')],
+ rpr=DXB.rpr_tag({'vanish': None})
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('iii')],
+ rpr=DXB.rpr_tag({'webHidden': None}),
+ ),
+ ],
+ ),
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag('jjj')],
+ rpr=DXB.rpr_tag({'vertAlign': 'superscript'}),
+ ),
+ ],
+ ),
+ ]
+ body = ''
+ for p_tag in p_tags:
+ body += p_tag
+
+ xml = DXB.xml(body)
+ return xml
+
+
+class UnicodeTestCase(_TranslationTestCase):
+ expected_output = u"""
+ \U0010001f
+ """
+
+ def get_xml(self):
+ tags = [
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag(r'')],
+ ),
+ ],
+ ),
+ ]
+
+ body = ''
+ for tag in tags:
+ body += tag
+ xml = DXB.xml(body)
+ return xml.encode('utf-8')
+
+
+class NoTextInTTagTestCase(_TranslationTestCase):
+ expected_output = u"""
+ """
+
+ def get_xml(self):
+ tags = [
+ DXB.p_tag(
+ [
+ DXB.r_tag(
+ [DXB.t_tag(None)],
+ ),
+ ],
+ ),
+ ]
+
+ body = ''
+ for tag in tags:
+ body += tag
+ xml = DXB.xml(body)
+ return xml.encode('utf-8')
diff --git a/pydocx/utils.py b/pydocx/utils.py
new file mode 100644
index 00000000..1323302b
--- /dev/null
+++ b/pydocx/utils.py
@@ -0,0 +1,495 @@
+import re
+import collections
+
+from collections import defaultdict
+from xml.etree import cElementTree
+
+from pydocx.exceptions import MalformedDocxException
+
+
+UPPER_ROMAN_TO_HEADING_VALUE = 'h2'
+TAGS_CONTAINING_CONTENT = (
+ 't',
+ 'pict',
+ 'drawing',
+ 'delText',
+ 'ins',
+)
+TAGS_HOLDING_CONTENT_TAGS = (
+ 'p',
+ 'tbl',
+ 'sdt',
+)
+
+
+class MulitMemoize(object):
+ '''
+ Adapted from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize
+ func_names = {
+ 'find_all': find_all,
+ ...
+ }
+ '''
+ def __init__(self, func_names):
+ self.cache = dict((func_name, {}) for func_name in func_names)
+ self.func_names = func_names
+
+ def __call__(self, func_name, *args):
+ if not isinstance(args, collections.Hashable):
+ # uncacheable. a list, for instance.
+ # better to not cache than blow up.
+ return self.func_names[func_name](*args)
+ if args in self.cache[func_name]:
+ return self.cache[func_name][args]
+ else:
+ value = self.func_names[func_name](*args)
+ self.cache[func_name][args] = value
+ return value
+
+
+class MulitMemoizeMixin(object):
+ def __init__(self, *args, **kwargs):
+ super(MulitMemoizeMixin, self).__init__(*args, **kwargs)
+ self._memoization = None
+
+ def memod_tree_op(self, func_name, *args):
+ return self._memoization(func_name, *args)
+
+ def populate_memoization(self, func_names):
+ self._memoization = MulitMemoize(func_names)
+
+
+def el_iter(el):
+ """
+ Go through all elements
+ """
+ try:
+ return el.iter()
+ except AttributeError:
+ return el.findall('.//*')
+
+
+def find_first(el, tag):
+ """
+ Find the first occurrence of a tag beneath the current element.
+ """
+ return el.find('.//' + tag)
+
+
+def find_all(el, tag):
+ """
+ Find all occurrences of a tag
+ """
+ return el.findall('.//' + tag)
+
+
+def find_ancestor_with_tag(pre_processor, el, tag):
+ """
+    Find the nearest ancestor element whose tag is `tag`.
+ """
+ while pre_processor.parent(el) is not None:
+ el = pre_processor.parent(el)
+ if el.tag == tag:
+ return el
+ return None
+
+
+def has_descendant_with_tag(el, tag):
+ """
+    Determine whether `el` has a descendant with the given tag.
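+
+    >>> el = cElementTree.fromstring('<a><b><c/></b></a>')
+    >>> has_descendant_with_tag(el, 'c')
+    True
+    >>> has_descendant_with_tag(el, 'd')
+    False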
+ """
+    # find_first stops at the first matching descendant.
+    return find_first(el, tag) is not None
+
+
+def _filter_children(element, tags):
+ return [
+ el for el in element.getchildren()
+ if el.tag in tags
+ ]
+
+
+def remove_namespaces(document):
+ """
+ >>> exception_raised = False
+ >>> try:
+ ... remove_namespaces('junk')
+ ... except MalformedDocxException:
+ ... exception_raised = True
+ >>> assert exception_raised
+ """
+ encoding_regex = re.compile(
+ r'<\?xml.*encoding="(.+?)"',
+ re.DOTALL | re.MULTILINE,
+ )
+ encoding = 'us-ascii'
+ m = encoding_regex.match(document)
+ if m:
+        encoding = m.group(1)
+ try:
+ root = cElementTree.fromstring(document)
+ except SyntaxError:
+ raise MalformedDocxException('This document cannot be converted.')
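+    # Strip the namespace prefix from every tag and attribute so that
+    # elements can be looked up with bare names like 'p' or 'tbl'.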
+ for child in el_iter(root):
+ child.tag = child.tag.split("}")[1]
+ child.attrib = dict(
+ (k.split("}")[-1], v)
+ for k, v in child.attrib.items()
+ )
+ return cElementTree.tostring(root, encoding=encoding)
+
+
+def get_list_style(numbering_root, num_id, ilvl):
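+    '''
+    Return the ``numFmt`` value (for example ``upperRoman``) defined in the
+    numbering root for the given ``num_id`` and indentation level ``ilvl``.
+    '''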
+    # This logic is needed by both the custom lxml parser and the pydocx
+    # parser, so it lives here as a shared function.
+ ids = find_all(numbering_root, 'num')
+ for _id in ids:
+ if _id.attrib['numId'] != num_id:
+ continue
+ abstractid = _id.find('abstractNumId')
+ abstractid = abstractid.attrib['val']
+ style_information = find_all(
+ numbering_root,
+ 'abstractNum',
+ )
+ for info in style_information:
+ if info.attrib['abstractNumId'] == abstractid:
+ for i in el_iter(info):
+ if (
+ 'ilvl' in i.attrib and
+ i.attrib['ilvl'] != ilvl):
+ continue
+ if i.find('numFmt') is not None:
+ return i.find('numFmt').attrib['val']
+
+
+class NamespacedNumId(object):
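+    '''
+    A ``num_id`` namespaced by the number of tables the list is nested in,
+    so that nested lists which reuse a ``numId`` are treated as new lists.
+
+    >>> one = NamespacedNumId(num_id='1', num_tables=0)
+    >>> two = NamespacedNumId(num_id='1', num_tables=1)
+    >>> one == two
+    False
+    >>> one
+    1:0
+    '''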
+ def __init__(self, num_id, num_tables, *args, **kwargs):
+ self._num_id = num_id
+ self._num_tables = num_tables
+
+ def __unicode__(self, *args, **kwargs):
+ return '%s:%d' % (
+ self._num_id,
+ self._num_tables,
+ )
+
+ def __repr__(self, *args, **kwargs):
+ return self.__unicode__(*args, **kwargs)
+
+ def __eq__(self, other):
+ if other is None:
+ return False
+ return repr(self) == repr(other)
+
+    def __ne__(self, other):
+        if other is None:
+            return True
+        return repr(self) != repr(other)
+
+ @property
+ def num_id(self):
+ return self._num_id
+
+
+class PydocxPreProcessor(MulitMemoizeMixin):
+ def __init__(
+ self,
+ convert_root_level_upper_roman=False,
+ styles_dict=None,
+ numbering_root=None,
+ *args, **kwargs):
+ self.meta_data = defaultdict(dict)
+ self.convert_root_level_upper_roman = convert_root_level_upper_roman
+ self.styles_dict = styles_dict
+ self.numbering_root = numbering_root
+
+ def perform_pre_processing(self, root, *args, **kwargs):
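+        '''
+        Walk the document tree once and record parent, list, table, heading
+        and sibling metadata for each element in ``self.meta_data``.
+        '''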
+ self.populate_memoization({
+ 'find_first': find_first,
+ })
+ self._add_parent(root)
+ # If we don't have a numbering root there cannot be any lists.
+ if self.numbering_root is not None:
+ self._set_list_attributes(root)
+ self._set_table_attributes(root)
+ self._set_is_in_table(root)
+
+ body = find_first(root, 'body')
+        p_elements = find_all(body, 'p')
+ list_elements = [
+ child for child in p_elements
+ if self.is_list_item(child)
+ ]
+ # Find the first and last li elements
+ num_ids = set([self.num_id(i) for i in list_elements])
+ ilvls = set([self.ilvl(i) for i in list_elements])
+ self._set_first_list_item(num_ids, ilvls, list_elements)
+ self._set_last_list_item(num_ids, list_elements)
+
+ self._set_headers(p_elements)
+ self._convert_upper_roman(body)
+ self._set_next(body)
+
+ def is_first_list_item(self, el):
+ return self.meta_data[el].get('is_first_list_item', False)
+
+ def is_last_list_item_in_root(self, el):
+ return self.meta_data[el].get('is_last_list_item_in_root', False)
+
+ def is_list_item(self, el):
+ return self.meta_data[el].get('is_list_item', False)
+
+ def num_id(self, el):
+ if not self.is_list_item(el):
+ return None
+ return self.meta_data[el].get('num_id')
+
+ def ilvl(self, el):
+ if not self.is_list_item(el):
+ return None
+ return self.meta_data[el].get('ilvl')
+
+ def heading_level(self, el):
+ return self.meta_data[el].get('heading_level')
+
+ def is_in_table(self, el):
+ return self.meta_data[el].get('is_in_table')
+
+ def row_index(self, el):
+ return self.meta_data[el].get('row_index')
+
+ def column_index(self, el):
+ return self.meta_data[el].get('column_index')
+
+ def vmerge_continue(self, el):
+ return self.meta_data[el].get('vmerge_continue')
+
+ def next(self, el):
+ if el not in self.meta_data:
+ return
+ return self.meta_data[el].get('next')
+
+ def previous(self, el):
+ if el not in self.meta_data:
+ return
+ return self.meta_data[el].get('previous')
+
+ def parent(self, el):
+ return self.meta_data[el].get('parent')
+
+    def _add_parent(self, el):
+        # Record each element's parent so it can be looked up later.
+ for child in el.getchildren():
+ self.meta_data[child]['parent'] = el
+ self._add_parent(child)
+
+ def _set_list_attributes(self, el):
+ list_elements = find_all(el, 'numId')
+ for li in list_elements:
+ parent = find_ancestor_with_tag(self, li, 'p')
+ # Deleted text in a list will have a numId but no ilvl.
+ if parent is None:
+ continue
+ parent_ilvl = self.memod_tree_op('find_first', parent, 'ilvl')
+ if parent_ilvl is None:
+ continue
+ self.meta_data[parent]['is_list_item'] = True
+ self.meta_data[parent]['num_id'] = self._generate_num_id(parent)
+ self.meta_data[parent]['ilvl'] = parent_ilvl.attrib['val']
+
+ def _generate_num_id(self, el):
+        '''
+        Fun fact: a list in the document root can hold a table that holds
+        another list, and both lists can share the same numId. When that
+        happens we namespace the nested list with the number of tables it
+        is nested in, so that it is treated as a new list. Otherwise all
+        sorts of terrible html gets generated.
+        '''
+ num_id = find_first(el, 'numId').attrib['val']
+
+ # First, go up the parent until we get None and count the number of
+ # tables there are.
+ num_tables = 0
+ while self.parent(el) is not None:
+ if el.tag == 'tbl':
+ num_tables += 1
+ el = self.parent(el)
+ return NamespacedNumId(
+ num_id=num_id,
+ num_tables=num_tables,
+ )
+
+ def _set_first_list_item(self, num_ids, ilvls, list_elements):
+ # Lists are grouped by having the same `num_id` and `ilvl`. The first
+ # list item is the first list item found for each `num_id` and `ilvl`
+ # combination.
+ for num_id in num_ids:
+ for ilvl in ilvls:
+ filtered_list_elements = [
+ i for i in list_elements
+ if (
+ self.num_id(i) == num_id and
+ self.ilvl(i) == ilvl
+ )
+ ]
+ if not filtered_list_elements:
+ continue
+ first_el = filtered_list_elements[0]
+ self.meta_data[first_el]['is_first_list_item'] = True
+
+ def _set_last_list_item(self, num_ids, list_elements):
+        # Find the last list element for each num_id. Only mark a list tag
+        # as the last list tag if it is in the root of the document. This
+        # is only used to ensure that once a root level list is finished we
+        # do not roll the rest of the non-list elements into the first root
+        # level list.
+ for num_id in num_ids:
+ filtered_list_elements = [
+ i for i in list_elements
+ if self.num_id(i) == num_id
+ ]
+ if not filtered_list_elements:
+ continue
+ last_el = filtered_list_elements[-1]
+ self.meta_data[last_el]['is_last_list_item_in_root'] = True
+
+ def _set_table_attributes(self, el):
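+        # Record the row and column index of every table cell, and flag
+        # cells that continue a vertical merge from the row above.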
+ tables = find_all(el, 'tbl')
+ for table in tables:
+ rows = _filter_children(table, ['tr'])
+            if not rows:
+ continue
+ for i, row in enumerate(rows):
+ tcs = _filter_children(row, ['tc'])
+ for j, child in enumerate(tcs):
+ self.meta_data[child]['row_index'] = i
+ self.meta_data[child]['column_index'] = j
+ v_merge = find_first(child, 'vMerge')
+ if (
+ v_merge is not None and
+ ('continue' == v_merge.get('val', '') or
+ v_merge.attrib == {})
+ ):
+ self.meta_data[child]['vmerge_continue'] = True
+
+ def _set_is_in_table(self, el):
+ paragraph_elements = find_all(el, 'p')
+ for p in paragraph_elements:
+ if find_ancestor_with_tag(self, p, 'tc') is not None:
+ self.meta_data[p]['is_in_table'] = True
+
+ def _set_headers(self, elements):
+        # Map header style names to the html heading tag that should be
+        # used for each of them.
+ headers = {
+ 'heading 1': 'h1',
+ 'heading 2': 'h2',
+ 'heading 3': 'h3',
+ 'heading 4': 'h4',
+ 'heading 5': 'h5',
+ 'heading 6': 'h6',
+ 'heading 7': 'h6',
+ 'heading 8': 'h6',
+ 'heading 9': 'h6',
+ 'heading 10': 'h6',
+ }
+        # Remove the rPr from the styles dict, since all of the styling is
+        # handled by the heading tag itself.
+ for style_id, styles in self.styles_dict.items():
+ if styles.get('style_name', '').lower() in headers:
+ if 'default_run_properties' in styles:
+ del styles['default_run_properties']
+
+ for element in elements:
+            # Elements without a pStyle use the default style, which is
+            # not a heading.
+ p_style = find_first(element, 'pStyle')
+ if p_style is None:
+ continue
+ style = p_style.attrib.get('val', '')
+ metadata = self.styles_dict.get(style, {})
+ style_name = metadata.get('style_name')
+
+ # Check to see if this element is actually a header.
+ if style_name and style_name.lower() in headers:
+ # Set all the list item variables to false.
+ self.meta_data[element]['is_list_item'] = False
+ self.meta_data[element]['is_first_list_item'] = False
+ self.meta_data[element]['is_last_list_item_in_root'] = False
+ # Prime the heading_level
+ self.meta_data[element]['heading_level'] = headers[style_name.lower()] # noqa
+
+ def _convert_upper_roman(self, body):
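+        # Root level list items numbered with upper roman numerals are
+        # marked as headings (UPPER_ROMAN_TO_HEADING_VALUE) rather than as
+        # list items.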
+ if not self.convert_root_level_upper_roman:
+ return
+ first_root_list_items = [
+ # Only root level elements.
+ el for el in body.getchildren()
+ # And only first_list_items
+ if self.is_first_list_item(el)
+ ]
+ visited_num_ids = []
+ all_p_tags_in_body = find_all(body, 'p')
+ for root_list_item in first_root_list_items:
+ if self.num_id(root_list_item) in visited_num_ids:
+ continue
+ visited_num_ids.append(self.num_id(root_list_item))
+ lst_style = get_list_style(
+ self.numbering_root,
+ self.num_id(root_list_item).num_id,
+ self.ilvl(root_list_item),
+ )
+ if lst_style != 'upperRoman':
+ continue
+ ilvl = min(
+ self.ilvl(el) for el in all_p_tags_in_body
+ if self.num_id(el) == self.num_id(root_list_item)
+ )
+ root_upper_roman_list_items = [
+ el for el in all_p_tags_in_body
+ if self.num_id(el) == self.num_id(root_list_item) and
+ self.ilvl(el) == ilvl
+ ]
+ for list_item in root_upper_roman_list_items:
+ self.meta_data[list_item]['is_list_item'] = False
+ self.meta_data[list_item]['is_first_list_item'] = False
+ self.meta_data[list_item]['is_last_list_item_in_root'] = False # noqa
+
+ self.meta_data[list_item]['heading_level'] = UPPER_ROMAN_TO_HEADING_VALUE # noqa
+
+ def _set_next(self, body):
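+        # Link each content-holding element to its next and previous
+        # sibling, both at the document root and within each table cell.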
+ def _get_children_with_content(el):
+ # We only care about children if they have text in them.
+ children = []
+ for child in _filter_children(el, TAGS_HOLDING_CONTENT_TAGS):
+ _has_descendant_with_tag = any(
+ has_descendant_with_tag(child, tag) for
+ tag in TAGS_CONTAINING_CONTENT
+ )
+ if _has_descendant_with_tag:
+ children.append(child)
+ return children
+
+ def _assign_next(children):
+ # Populate the `next` attribute for all the child elements.
+ for i in range(len(children)):
+ try:
+ if children[i + 1] is not None:
+ self.meta_data[children[i]]['next'] = children[i + 1] # noqa
+ except IndexError:
+ pass
+ try:
+ if children[i - 1] is not None:
+ self.meta_data[children[i]]['previous'] = children[i - 1] # noqa
+ except IndexError:
+ pass
+ # Assign next for everything in the root.
+ _assign_next(_get_children_with_content(body))
+
+ # In addition set next for everything in table cells.
+ for tc in find_all(body, 'tc'):
+ _assign_next(_get_children_with_content(tc))
+
+
+def parse_xml_from_string(xml):
+ return cElementTree.fromstring(remove_namespaces(xml))
diff --git a/requirements.txt b/requirements.txt
index f9954ad0..77421ff8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,4 @@
-beautifulsoup4>=4.1.0
+Jinja2>=2.0
+coverage==3.6
+nose==1.3.0
+flake8
diff --git a/run_tests.sh b/run_tests.sh
new file mode 100755
index 00000000..da46b811
--- /dev/null
+++ b/run_tests.sh
@@ -0,0 +1,4 @@
+#! /bin/sh
+
+nosetests --verbose --with-doctest --with-coverage --cover-package pydocx "$@" &&
+find . -name '*.py' | xargs flake8
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..defe5013
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,62 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+
+try:
+ from setuptools import setup, find_packages
+except ImportError:
+ from ez_setup import use_setuptools
+ use_setuptools()
+ from setuptools import setup, find_packages # noqa
+
+
+def rel_file(*args):
+    return os.path.join(
+        os.path.dirname(os.path.abspath(__file__)), *args)
+
+
+def get_file(filename):
+ with open(rel_file(filename)) as f:
+ return f.read()
+
+
+def get_description():
+ return get_file('README.rst') + get_file('CHANGELOG')
+
+setup(
+ name="PyDocX",
+ # Edit here and pydocx.__init__
+ version="0.3.13",
+ description="docx (OOXML) to html converter",
+ author="Jason Ward, Sam Portnow",
+ author_email="jason.louard.ward@gmail.com, samson91787@gmail.com",
+ url="http://github.com/CenterForOpenScience/pydocx",
+ platforms=["any"],
+ license="BSD",
+ packages=find_packages(),
+ package_data={
+ 'pydocx': [
+ 'tests/templates/*.xml',
+ ],
+ },
+ scripts=[],
+ zip_safe=False,
+ install_requires=[],
+ cmdclass={},
+ classifiers=[
+ "Development Status :: 3 - Alpha",
+ "Programming Language :: Python",
+ "Programming Language :: Python :: 2.6",
+ "Programming Language :: Python :: 2.7",
+ "Programming Language :: Python :: 2 :: Only",
+ "Intended Audience :: Developers",
+ "License :: OSI Approved :: BSD License",
+ "Operating System :: OS Independent",
+ "Topic :: Text Processing :: Markup :: HTML",
+ "Topic :: Text Processing :: Markup :: XML",
+ ],
+ long_description=get_description(),
+ entry_points={
+ 'console_scripts': [
+ 'pydocx = pydocx.__init__:main',
+ ],
+ },
+)