diff --git a/.gitignore b/.gitignore index ce7a7cef..5a57b80d 100644 --- a/.gitignore +++ b/.gitignore @@ -37,3 +37,8 @@ pip-log.txt nosetests.xml *.mo .idea + +test.html +testxml.html + +main.py diff --git a/.travis.yml b/.travis.yml index 6a5babb4..4251ba15 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,9 +2,13 @@ language: python python: - "2.6" - "2.7" -script: python main.py +script: ./run_tests.sh install: + - python setup.py -q install - pip install -r requirements.txt +env: + - TRAVIS_EXECUTE_PERFORMANCE=1 notifications: email: - jason.louard.ward@gmail.com + - samson91787@gmail.com diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 00000000..81a14d38 --- /dev/null +++ b/AUTHORS @@ -0,0 +1,2 @@ +Sam Protnow +Jason Ward diff --git a/CHANGELOG b/CHANGELOG new file mode 100644 index 00000000..d40440c9 --- /dev/null +++ b/CHANGELOG @@ -0,0 +1,74 @@ + +Changelog +========= +* 0.3.13 + * Significant performance gains for documents with a large number of table + cells. + * Significant performance gains for large documents. +* 0.3.12 + * Added command line support to convert from docx to either html or + markdown. +* 0.3.11 + * The non breaking hyphen tag was not correctly being imported. This issue + has been fixed. +* 0.3.10 + * Found and optimized a fairly large performance issue with tables that had + large amounts of content within a single cell, which includes nested + tables. +* 0.3.9 + * We are now respecting the `<w:tab/>` element. We are putting a space in + everywhere they happen. + * Each styling can have a default defined based on values in `styles.xml`. + These default styles can be overwritten using the `rPr` on the actual `r` + tag. These default styles defined in `styles.xml` are actually being + respected now. +* 0.3.8 + * If zipfile fails to open the passed in file, we are now raising a + `MalformedDocxException` instead of a `BadZipfile`. 
+* 0.3.7 + * Some inline tags (most notably the underline tag) could have a `val` of + `none` and that would signify that the style is disabled. A `val` of + `none` is now correctly handled. +* 0.3.6 + * It is possible for a docx file to not contain a `numbering.xml` file but + still try to use lists. Now if this happens all lists get converted to + paragraphs. +* 0.3.5 + * Not all docx files contain a `styles.xml` file. We are no longer assuming + they do. +* 0.3.4 + * It is possible for `w:t` tags to have `text` set to `None`. This no + longer causes an error when escaping that text. +* 0.3.3 + * In the event that `cElementTree` has a problem parsing the document, a + `MalformedDocxException` is raised instead of a `SyntaxError` +* 0.3.2 + * We were not taking into account that vertical merges should have a + continue attribute, but sometimes they do not, and in those cases word + assumes the continue attribute. We updated the parser to handle the + cases in which the continue attribute is not there. + * We now correctly handle documents with unicode character in the + namespace. + * In rare cases, some text would be output with a style when it should not + have been. This issue has been fixed. +* 0.3.1 + * Added support for several more OOXML tags including: + * caps + * smallCaps + * strike + * dstrike + * vanish + * webHidden + More details in the README. +* 0.3.0 + * We switched from using stock *xml.etree.ElementTree* to using + *xml.etree.cElementTree*. This has resulted in a fairly significant speed + increase for python 2.6 + * It is now possible to create your own pre processor to do additional pre + processing. + * Superscripts and subscripts are now extracted correctly. +* 0.2.1 + * Added a changelog + * Added the version in pydocx.__init__ + * Fixed an issue with duplicating content if there was indentation or + justification on a p element that had multiple t tags. 
diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 00000000..88fbbf67 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1,7 @@ +include AUTHORS +include CHANGELOG +include LICENSE +include MANIFEST.in +include README.rst +include pydocx/fixtures/* +include pydocx/tests/templates/* diff --git a/README.md b/README.md deleted file mode 100644 index e3773551..00000000 --- a/README.md +++ /dev/null @@ -1,2 +0,0 @@ -pydocx -====== \ No newline at end of file diff --git a/README.rst b/README.rst new file mode 100644 index 00000000..1bb9b3b1 --- /dev/null +++ b/README.rst @@ -0,0 +1,238 @@ +====== +pydocx +====== +.. image:: https://travis-ci.org/CenterForOpenScience/pydocx.png?branch=master + :align: left + :target: https://travis-ci.org/CenterForOpenScience/pydocx + +pydocx is a parser that breaks down the elements of a docx file and converts them +into different markup languages. Right now, HTML is supported. Markdown and LaTeX +will be available soon. You can extend any of the available parsers to customize it +to your needs. You can also create your own class that inherits DocxParser +to create your own methods for a markup language not yet supported. + +Currently Supported +################### + +* tables + * nested tables + * rowspans + * colspans + * lists in tables +* lists + * list styles + * nested lists + * list of tables + * list of paragraphs +* justification +* images +* styles + * bold + * italics + * underline + * hyperlinks +* headings + +Usage +##### + +DocxParser includes abstract methods that each parser overrides to satisfy its own needs. 
The abstract methods are as follows: + +:: + + class DocxParser: + + @property + def parsed(self): + return self._parsed + + @property + def escape(self, text): + return text + + @abstractmethod + def linebreak(self): + return '' + + @abstractmethod + def paragraph(self, text): + return text + + @abstractmethod + def heading(self, text, heading_level): + return text + + @abstractmethod + def insertion(self, text, author, date): + return text + + @abstractmethod + def hyperlink(self, text, href): + return text + + @abstractmethod + def image_handler(self, path): + return path + + @abstractmethod + def image(self, path, x, y): + return self.image_handler(path) + + @abstractmethod + def deletion(self, text, author, date): + return text + + @abstractmethod + def bold(self, text): + return text + + @abstractmethod + def italics(self, text): + return text + + @abstractmethod + def underline(self, text): + return text + + @abstractmethod + def superscript(self, text): + return text + + @abstractmethod + def subscript(self, text): + return text + + @abstractmethod + def tab(self): + return True + + @abstractmethod + def ordered_list(self, text): + return text + + @abstractmethod + def unordered_list(self, text): + return text + + @abstractmethod + def list_element(self, text): + return text + + @abstractmethod + def table(self, text): + return text + @abstractmethod + def table_row(self, text): + return text + + @abstractmethod + def table_cell(self, text): + return text + + @abstractmethod + def page_break(self): + return True + + @abstractmethod + def indent(self, text, left='', right='', firstLine=''): + return text + +Docx2Html inherits DocxParser and implements basic HTML handling. Ex. + +:: + + class Docx2Html(DocxParser): + + # Escape '&', '<', and '>' so we render the HTML correctly + def escape(self, text): + return xml.sax.saxutils.quoteattr(text)[1:-1] + + # return a line break + def linebreak(self, pre=None): + return '
<br />' + + # add paragraph tags + def paragraph(self, text, pre=None): + return '<p>' + text + '</p>' + + +However, let's say you want to add a specific style to your HTML document. In order to do this, you want to give each paragraph the class `my_implementation`. Simply extend `Docx2Html` and add what you need. + +:: + + class My_Implementation_of_Docx2Html(Docx2Html): + + def paragraph(self, text, pre = None): + return
'<p class="my_implementation">' + text + '</p>' + + + +OR, let's say FOO is your new favorite markup language. Simply customize your own new parser, overriding the abstract methods of DocxParser + +:: + + class Docx2Foo(DocxParser): + + # because linebreaks are denoted by '!!!!!!!!!!!!' with the FOO markup language :) + def linebreak(self): + return '!!!!!!!!!!!!' + +Custom Pre-Processor +#################### + +When creating your own Parser (as described above) you can now add in your own custom Pre Processor. To do so you will need to set the `pre_processor_class` field on the custom parser, like so: + +:: + + class Docx2Foo(DocxParser): + pre_processor_class = FooPreProcessor + + +The `FooPreProcessor` will need a few things to get you going: + +:: + + class FooPreProcessor(PydocxPreProcessor): + def perform_pre_processing(self, root, *args, **kwargs): + super(FooPreProcessor, self).perform_pre_processing(root, *args, **kwargs) + self._set_foo(root) + + def _set_foo(self, root): + pass + +If you want `_set_foo` to be called you must add it to `perform_pre_processing` which is called in the base parser for pydocx. + +Everything done during pre-processing is executed prior to `parse` being called for the first time. + + +Styles +###### + +The base parser `Docx2Html` relies on certain CSS classes being set for certain behaviour to occur. Currently these include: + +* class `pydocx-insert` -> Turns the text green. +* class `pydocx-delete` -> Turns the text red and draws a line through the text. +* class `pydocx-center` -> Aligns the text to the center. +* class `pydocx-right` -> Aligns the text to the right. +* class `pydocx-left` -> Aligns the text to the left. +* class `pydocx-comment` -> Turns the text blue. +* class `pydocx-underline` -> Underlines the text. +* class `pydocx-caps` -> Makes all text uppercase. +* class `pydocx-small-caps` -> Makes all text uppercase, however truly lowercase letters will be smaller than their uppercase counterparts. +* class `pydocx-strike` -> Strike a line through. 
+* class `pydocx-hidden` -> Hide the text. + +Exceptions +########## + +Right now there is only one custom exception (`MalformedDocxException`). It is raised if either the `xml` or `zipfile` libraries raise an exception. + +Optional Arguments +################## + +You can pass in `convert_root_level_upper_roman=True` to the parser and it will convert all root level upper roman lists to headings instead. + +Command Line Execution +###################### + +First you have to install pydocx, this can be done by running the command `pip install pydocx`. From there you can simply call the command `pydocx --html path/to/file.docx path/to/output.html`. Change `pydocx --html` to `pydocx --markdown` in order to convert to markdown instead. diff --git a/main.py b/main.py deleted file mode 100644 index c9e8e1d4..00000000 --- a/main.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydocx import * -from bs4 import BeautifulSoup -import xml.etree.ElementTree as ElementTree -#import lxml.etree as etree - -with open('test.html', 'w') as f: - f.write(docx2html('helloworld.docx')) -with open('testxml.html','w') as f: - f.write(BeautifulSoup(ElementTree.tostring(Docx2Html('helloworld.docx').root)).prettify()) - -#print docx2html('helloworld.docx') -#print docx2markdown('helloworld.docx') \ No newline at end of file diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py index b3006ef0..fb08b180 100644 --- a/pydocx/DocxParser.py +++ b/pydocx/DocxParser.py @@ -1,323 +1,715 @@ -from abc import abstractmethod, ABCMeta -import zipfile import logging -import xml.etree.ElementTree as ElementTree -from xml.etree.ElementTree import _ElementInterface +import os +import zipfile + +from abc import abstractmethod, ABCMeta +from contextlib import contextmanager + +from pydocx.utils import ( + MulitMemoizeMixin, + PydocxPreProcessor, + find_all, + find_ancestor_with_tag, + find_first, + get_list_style, + has_descendant_with_tag, + parse_xml_from_string, +) +from pydocx.exceptions import 
MalformedDocxException logging.basicConfig(level=logging.DEBUG) logger = logging.getLogger("NewParser") -def remove_namespaces(document): - root = ElementTree.fromstring(document) - for child in el_iter(root): - child.tag = child.tag.split("}")[1] - child.attrib = dict( - (k.split("}")[1], v) - for k, v in child.attrib.items() - ) - return ElementTree.tostring(root) - -# Add some helper functions to Element to make it slightly more readable - - -def has_child(self, tag): - return True if self.find(tag) is not None else False - - -def has_child_all(self, tag): - return True if self.find('.//' + tag) is not None else False +# http://openxmldeveloper.org/discussions/formats/f/15/p/396/933.aspx +EMUS_PER_PIXEL = 9525 +USE_ALIGNMENTS = True +JUSTIFY_CENTER = 'center' +JUSTIFY_LEFT = 'left' +JUSTIFY_RIGHT = 'right' -def find_all(self, tag): - return self.find('.//' + tag) +INDENTATION_RIGHT = 'right' +INDENTATION_LEFT = 'left' +INDENTATION_FIRST_LINE = 'firstLine' +DISABLED_STYLE_VALUES = ['false', '0', 'none'] - -def findall_all(self, tag): - return self.findall('.//' + tag) +# Add some helper functions to Element to make it slightly more readable -def el_iter(el): +@contextmanager +def ZipFile(path): # This is not needed in python 3.2+ try: - return el.iter() - except AttributeError: - return el.findall('.//*') - - -setattr(_ElementInterface, 'has_child', has_child) -setattr(_ElementInterface, 'has_child_all', has_child_all) -setattr(_ElementInterface, 'find_all', find_all) -setattr(_ElementInterface, 'findall_all', findall_all) -setattr(_ElementInterface, 'parent', None) -setattr(_ElementInterface, 'parent_list', []) - -# End helpers + f = zipfile.ZipFile(path) + except zipfile.BadZipfile: + raise MalformedDocxException('Passed in document is not a docx') + yield f + f.close() -class DocxParser: +class DocxParser(MulitMemoizeMixin): __metaclass__ = ABCMeta + pre_processor_class = PydocxPreProcessor - def __init__(self, path): - self._parsed = '' - self.in_list = False 
- - f = zipfile.ZipFile(path) + def _extract_xml(self, f, xml_path): try: - self.document_text = f.read('word/document.xml') - try: - self.numbering_text = f.read('word/numbering.xml') - except zipfile.BadZipfile: - pass - try: - self.comment_text = f.read('word/comments.xml') - except zipfile.BadZipfile: - pass - finally: - f.close() - - self.root = ElementTree.fromstring( - remove_namespaces(self.document_text), - ) - - def add_parent(el): - for child in el.getchildren(): - setattr(child, 'parent', el) - add_parent(child) - add_parent(self.root) - - def create_parent_list(el, tmp=None): - if tmp is None: - tmp = [] - for child in el: - tmp.append(el) - tmp = create_parent_list(child, tmp) - el.parent_list = tmp[:] - try: - tmp.pop() - except: - tmp = [] - return tmp - - create_parent_list(self.root) + return f.read(xml_path) + except KeyError: + return None + def _build_data(self, path, *args, **kwargs): + with ZipFile(path) as f: + # These must be in the ZIP in order for the docx to be valid. + self.document_text = f.read('word/document.xml') + self.relationship_text = f.read('word/_rels/document.xml.rels') + + # These are all optional. 
+ self.styles_text = self._extract_xml(f, 'word/styles.xml') + self.fonts = self._extract_xml(f, 'word/fontTable.xml') + self.numbering_text = self._extract_xml(f, 'word/numbering.xml') + self.comment_text = self._extract_xml(f, 'word/comments.xml') + + zipped_image_files = [ + e for e in f.infolist() + if e.filename.startswith('word/media/') + ] + for e in zipped_image_files: + self._image_data[e.filename] = f.read(e.filename) + + self.root = parse_xml_from_string(self.document_text) + self.numbering_root = None + if self.numbering_text: + self.numbering_root = parse_xml_from_string(self.numbering_text) + self.comment_root = None + if self.comment_text: + self.comment_root = parse_xml_from_string(self.comment_text) + + def _parse_run_properties(self, rPr): + """ + Takes an `rPr` and returns a dictionary contain the tag name mapped to + the child's value property. + + If you have an rPr that looks like this: + + + + + + + That will result in a dictionary that looks like this: + { + 'b': '', + 'u': 'false', + 'sz': '16', + } + """ + run_properties = {} + if rPr is None: + return {} + for run_property in rPr: + val = run_property.get('val', '').lower() + run_properties[run_property.tag] = val + return run_properties + + def _parse_styles(self): + if self.styles_text is None: + return {} + tree = parse_xml_from_string(self.styles_text) + styles_dict = {} + for style in find_all(tree, 'style'): + style_val = find_first(style, 'name').attrib['val'] + run_properties = find_first(style, 'rPr') + styles_dict[style.attrib['styleId']] = { + 'style_name': style_val, + 'default_run_properties': self._parse_run_properties( + run_properties, + ), + } + return styles_dict + + def _parse_rels_root(self): + tree = parse_xml_from_string(self.relationship_text) + rels_dict = {} + for el in tree: + rId = el.get('Id') + target = el.get('Target') + rels_dict[rId] = target + return rels_dict + + def __init__( + self, + path, + convert_root_level_upper_roman=False, + *args, + **kwargs): + 
self._parsed = '' + self.block_text = '' + self.page_width = 0 + self.convert_root_level_upper_roman = convert_root_level_upper_roman + self._image_data = {} + self._build_data(path, *args, **kwargs) + self.pre_processor = None + + #divide by 20 to get to pt (Office works in 20th's of a point) + """ + see http://msdn.microsoft.com/en-us/library/documentformat + .openxml.wordprocessing.indentation.aspx + """ + if find_first(self.root, 'pgSz') is not None: + self.page_width = int( + find_first(self.root, 'pgSz').attrib['w'] + ) / 20 + + #all blank when we init self.comment_store = None - self.numbering_store = None - self.ignore_current = False - self.elements = [] - self.tables_seen = [] - self.visited = [] - try: - self.numbering_root = ElementTree.fromstring( - remove_namespaces(self.numbering_text), - ) - except: - pass - self.parse_begin(self.root) + self.visited = set() + self.list_depth = 0 + self.rels_dict = self._parse_rels_root() + self.styles_dict = self._parse_styles() + self.parse_begin(self.root) # begin to parse def parse_begin(self, el): - self._parsed += self.parse_lists(el) - -### parse table function and is_table flag - def parse_lists(self, el): - parsed = '' - first_p = el.find_all('p') - children = [] - for child in first_p.parent: - if child.tag == 'p' or child.tag == 'tbl': - children.append(child) - p_list = children - list_started = False - list_type = '' - list_chunks = [] - index_start = 0 - index_end = 1 - for i, el in enumerate(p_list): - if not list_started and el.has_child_all('ilvl'): - list_started = True - list_type = self.get_list_style( - el.find_all('numId').attrib['val'], - ) - list_chunks.append(p_list[index_start:index_end]) - index_start = i - index_end = i+1 - elif ( - list_started and - el.has_child_all('ilvl') and - not list_type == self.get_list_style( - el.find_all('numId').attrib['val'] - )): - list_type = self.get_list_style( - el.find_all('numId').attrib['val'], - ) - list_started = True - 
list_chunks.append(p_list[index_start:index_end]) - index_start = i - index_end = i+1 - elif list_started and not el.has_child_all('ilvl'): - list_started = False - list_chunks.append(p_list[index_start:index_end]) - index_start = i - index_end = i+1 - else: - index_end = i+1 - list_chunks.append(p_list[index_start:index_end]) - for chunk in list_chunks: - chunk_parsed = '' - for el in chunk: - chunk_parsed += self.parse(el) - if chunk[0].has_child_all('ilvl'): - lst_style = self.get_list_style( - chunk[0].find_all('numId').attrib['val'], - ) - if lst_style['val'] == 'bullet': - parsed += self.unordered_list(chunk_parsed) - else: - parsed += self.ordered_list(chunk_parsed) - elif chunk[0].has_child_all('br'): - parsed += self.page_break() - else: - parsed += chunk_parsed - - return parsed + self.populate_memoization({ + 'find_all': find_all, + 'find_first': find_first, + 'has_descendant_with_tag': has_descendant_with_tag, + '_get_tcs_in_column': self._get_tcs_in_column, + }) + self.pre_processor = self.pre_processor_class( + convert_root_level_upper_roman=self.convert_root_level_upper_roman, + styles_dict=self.styles_dict, + numbering_root=self.numbering_root, + ) + self.pre_processor.perform_pre_processing(el) + self._parsed += self.parse(el) def parse(self, el): + if el in self.visited: + return '' + self.visited.add(el) parsed = '' - if not self.ignore_current: - tmp_d = dict( - (tmpel.tag, i) - for i, tmpel in enumerate(el.parent_list) - ) - if ( - 'tbl' in tmp_d and - el.parent_list[tmp_d['tbl']] not in self.tables_seen): - self.ignore_current = True - self.tables_seen.append(el.parent_list[tmp_d['tbl']]) - tmpout = self.table(self.parse(el.parent_list[tmp_d['tbl']])) - self.ignore_current = False - return tmpout - for child in el: + # recursive. 
So you can get all the way to the bottom parsed += self.parse(child) - - if el.tag == 'br' and el.attrib['type'] == 'page': - #TODO figure out what parsed is getting overwritten - return self.page_break() - # add it to the list so we don't repeat! - if el.tag == 'ilvl' and el not in self.visited: - self.in_list = True - self.visited.append(el) - ## This starts the returns + if el.tag == 'br' and el.attrib.get('type') == 'page': + return self.parse_page_break(el, parsed) + # page breaks use lastRenderedPageBreak in MS Word > 2007 + elif el.tag == 'lastRenderedPageBreak': + return self.parse_page_break(el, parsed) + elif el.tag == 'tbl': + return self.parse_table(el, parsed) elif el.tag == 'tr': - return self.table_row(parsed) + return self.parse_table_row(el, parsed) elif el.tag == 'tc': - self.elements.append(el) - return self.table_cell(parsed) - if el.tag == 'r' and el not in self.elements: - self.elements.append(el) - return self.parse_r(el) + return self.parse_table_cell(el, parsed) + elif el.tag == 'r': + return self.parse_r(el, parsed) + elif el.tag == 't': + return self.parse_t(el, parsed) + elif el.tag == 'tab': + return self.parse_tab(el, parsed) + elif el.tag == 'noBreakHyphen': + return self.parse_hyphen(el, parsed) + elif el.tag == 'br': + return self.parse_break_tag(el, parsed) + elif el.tag == 'delText': + return self.parse_deletion(el, parsed) elif el.tag == 'p': return self.parse_p(el, parsed) elif el.tag == 'ins': - return self.insertion(parsed, '', '') + return self.parse_insertion(el, parsed) + elif el.tag == 'hyperlink': + return self.parse_hyperlink(el, parsed) + elif el.tag in ('pict', 'drawing'): + return self.parse_image(el) + else: + return parsed + + def parse_page_break(self, el, text): + #TODO figure out what parsed is getting overwritten + return self.page_break() + + def parse_table(self, el, text): + return self.table(text) + + def parse_table_row(self, el, text): + return self.table_row(text) + + def parse_table_cell(self, el, text): 
+ v_merge = find_first(el, 'vMerge') + if v_merge is not None and ( + 'restart' != v_merge.get('val', '')): + return '' + colspan = self.get_colspan(el) + rowspan = self._get_rowspan(el, v_merge) + if rowspan > 1: + rowspan = str(rowspan) + else: + rowspan = '' + return self.table_cell(text, colspan, rowspan) + + def parse_list(self, el, text): + """ + All the meat of building the list is done in _parse_list, however we + call this method for two reasons: It is the naming convention we are + following. And we need a reliable way to raise and lower the list_depth + (which is used to determine if we are in a list). I could have done + this in _parse_list, however it seemed cleaner to do it here. + """ + self.list_depth += 1 + parsed = self._parse_list(el, text) + self.list_depth -= 1 + if self.pre_processor.is_in_table(el): + return self.parse_table_cell_contents(el, parsed) + return parsed + + def get_list_style(self, num_id, ilvl): + return get_list_style(self.numbering_root, num_id, ilvl) + + def _build_list(self, el, text): + # Get the list style for the pending list. + lst_style = self.get_list_style( + self.pre_processor.num_id(el).num_id, + self.pre_processor.ilvl(el), + ) + + parsed = text + # Create the actual list and return it. + if lst_style == 'bullet': + return self.unordered_list(parsed) else: + return self.ordered_list( + parsed, + lst_style, + ) + + def _parse_list(self, el, text): + parsed = self.parse_list_item(el, text) + num_id = self.pre_processor.num_id(el) + ilvl = self.pre_processor.ilvl(el) + # Everything after this point assumes the first element is not also the + # last. If the first element is also the last then early return by + # building and returning the completed list. 
+ if self.pre_processor.is_last_list_item_in_root(el): + return self._build_list(el, parsed) + next_el = self.pre_processor.next(el) + + def is_same_list(next_el, num_id, ilvl): + # Bail if next_el is not an element + if next_el is None: + return False + if self.pre_processor.is_last_list_item_in_root(next_el): + return False + # If next_el is not a list item then roll it into the list by + # returning True. + if not self.pre_processor.is_list_item(next_el): + return True + if self.pre_processor.num_id(next_el) != num_id: + # The next element is a new list entirely + return False + if self.pre_processor.ilvl(next_el) < ilvl: + # The next element is de-indented, so this is really the last + # element in the list + return False + return True + + while is_same_list(next_el, num_id, ilvl): + if next_el in self.visited: + # Early continue for elements we have already visited. + next_el = self.pre_processor.next(next_el) + continue + + if self.pre_processor.is_list_item(next_el): + # Reset the ilvl + ilvl = self.pre_processor.ilvl(next_el) + + parsed += self.parse(next_el) + next_el = self.pre_processor.next(next_el) + + def should_parse_last_el(last_el, first_el): + if last_el is None: + return False + # Different list + if ( + self.pre_processor.num_id(last_el) != + self.pre_processor.num_id(first_el)): + return False + # Will be handled when the ilvls do match (nesting issue) + if ( + self.pre_processor.ilvl(last_el) != + self.pre_processor.ilvl(first_el)): + return False + # We only care about last items that have not been parsed before + # (first list items are always parsed at the beginning of this + # method.) + return ( + not self.pre_processor.is_first_list_item(last_el) and + self.pre_processor.is_last_list_item_in_root(last_el) + ) + if should_parse_last_el(next_el, el): + parsed += self.parse(next_el) + + # If the list has no content, then we don't need to worry about the + # list styling, because it will be stripped out. 
+ if parsed == '': return parsed + return self._build_list(el, parsed) + + def justification(self, el, text): + paragraph_tag_property = el.find('pPr') + if paragraph_tag_property is None: + return text + + _justification = paragraph_tag_property.find('jc') + indentation = paragraph_tag_property.find('ind') + if _justification is None and indentation is None: + return text + alignment = None + right = None + left = None + firstLine = None + if _justification is not None: # text alignments + value = _justification.attrib['val'] + if value in [JUSTIFY_LEFT, JUSTIFY_CENTER, JUSTIFY_RIGHT]: + alignment = value + + if indentation is not None: + if INDENTATION_RIGHT in indentation.attrib: + right = indentation.attrib[INDENTATION_RIGHT] + # divide by 20 to get to pt. multiply by (4/3) to get to px + right = (int(right) / 20) * float(4) / float(3) + right = str(right) + if INDENTATION_LEFT in indentation.attrib: + left = indentation.attrib[INDENTATION_LEFT] + left = (int(left) / 20) * float(4) / float(3) + left = str(left) + if INDENTATION_FIRST_LINE in indentation.attrib: + firstLine = indentation.attrib[INDENTATION_FIRST_LINE] + firstLine = (int(firstLine) / 20) * float(4) / float(3) + firstLine = str(firstLine) + if any([alignment, firstLine, left, right]): + return self.indent(text, alignment, firstLine, left, right) + return text + def parse_p(self, el, text): + if text == '': + return '' + # TODO This is still not correct, however it fixes the bug. We need to + # apply the classes/styles on p, td, li and h tags instead of inline, + # but that is for another ticket. 
+ text = self.justification(el, text) + if self.pre_processor.is_first_list_item(el): + return self.parse_list(el, text) + if self.pre_processor.heading_level(el): + return self.parse_heading(el, text) + if self.pre_processor.is_list_item(el): + return self.parse_list_item(el, text) + if self.pre_processor.is_in_table(el): + return self.parse_table_cell_contents(el, text) parsed = text - if self.in_list: - self.in_list = False - parsed = self.list_element(parsed) - elif ( - not el.has_child_all('t') and - 'tbl' not in [i.tag for i in el.parent_list]): - parsed = self.linebreak() - elif el.parent not in self.elements: + # No p tags in li tags + if self.list_depth == 0: parsed = self.paragraph(parsed) return parsed - def parse_r(self, el): - is_deleted = False - text = None - if el.has_child('t'): - text = self.escape(el.find('t').text) - elif el.has_child('delText'): - text = self.escape(el.find('delText').text) - is_deleted = True - if text: - rpr = el.find('rPr') - if rpr is not None: - fns = [] - if rpr.has_child('b'): - fns.append(self.bold) - if rpr.has_child('i'): - fns.append(self.italics) - if rpr.has_child('u'): - fns.append(self.underline) - for fn in fns: - text = fn(text) - ppr = el.parent.find('pPr') - if ppr is not None: - jc = ppr.find('jc') - if jc is not None: - if jc.attrib['val'] == 'right': - text = self.right_justify(text) - if jc.attrib['val'] == 'center': - text = self.center_justify(text) - ind = ppr.find('ind') - if ind is not None: - right = None - left = None - firstLine = None - if 'right' in ind.attrib: - right = ind.attrib['right'] - right = int(right)/20 - right = str(right) - if 'left' in ind.attrib: - left = ind.attrib['left'] - left = int(left)/20 - left = str(left) - if 'firstLine' in ind.attrib: - firstLine = ind.attrib['firstLine'] - firstLine = int(firstLine)/20 - firstLine = str(firstLine) - text = self.indent(text, right, left, firstLine) - if is_deleted: - text = self.deletion(text, '', '') + def 
_should_append_break_tag(self, next_el): + paragraph_like_tags = [ + 'p', + ] + inline_like_tags = [ + 'smartTag', + 'ins', + 'delText', + ] + if self.pre_processor.is_list_item(next_el): + return False + if self.pre_processor.previous(next_el) is None: + return False + tag_is_inline_like = any( + self.memod_tree_op('has_descendant_with_tag', next_el, tag) for + tag in inline_like_tags + ) + if tag_is_inline_like: + return False + if ( + self.pre_processor.is_last_list_item_in_root( + self.pre_processor.previous(next_el))): + return False + if self.pre_processor.previous(next_el).tag not in paragraph_like_tags: + return False + if next_el.tag not in paragraph_like_tags: + return False + return True + + def parse_heading(self, el, parsed): + return self.heading(parsed, self.pre_processor.heading_level(el)) + + def parse_list_item(self, el, text): + # If for whatever reason we are not currently in a list, then start + # a list here. This will only happen if the num_id/ilvl combinations + # between lists is not well formed. + parsed = text + if self.list_depth == 0: + return self.parse_list(el, parsed) + + def _should_parse_next_as_content(el): + """ + Get the contents of the next el and append it to the + contents of the current el (that way things like tables + are actually in the li tag instead of in the ol/ul tag). 
+ """ + next_el = self.pre_processor.next(el) + if next_el is None: + return False + if ( + not self.pre_processor.is_list_item(next_el) and + not self.pre_processor.is_last_list_item_in_root(el) + ): + return True + if self.pre_processor.is_first_list_item(next_el): + if ( + self.pre_processor.num_id(next_el) == + self.pre_processor.num_id(el)): + return True + return False + + while el is not None: + if _should_parse_next_as_content(el): + el = self.pre_processor.next(el) + next_elements_content = self.parse(el) + if not next_elements_content: + continue + if self._should_append_break_tag(el): + parsed += self.break_tag() + parsed += next_elements_content + else: + break + # Create the actual li element + return self.list_element(parsed) + + def _get_tcs_in_column(self, tbl, column_index): + return [ + tc for tc in self.memod_tree_op('find_all', tbl, 'tc') + if self.pre_processor.column_index(tc) == column_index + ] + + def _get_rowspan(self, el, v_merge): + restart_in_v_merge = False + if v_merge is not None and 'val' in v_merge.attrib: + restart_in_v_merge = 'restart' in v_merge.attrib['val'] + + if not restart_in_v_merge: + return '' + + current_row = self.pre_processor.row_index(el) + current_col = self.pre_processor.column_index(el) + rowspan = 1 + result = '' + tbl = find_ancestor_with_tag(self.pre_processor, el, 'tbl') + # We only want table cells that have a higher row_index that is greater + # than the current_row and that are on the current_col + if tbl is None: + return '' + + tcs = [ + tc for tc in self.memod_tree_op( + '_get_tcs_in_column', tbl, current_col, + ) if self.pre_processor.row_index(tc) >= current_row + ] + + def should_increment_rowspan(tc): + if not self.pre_processor.vmerge_continue(tc): + return False + return True + + for tc in tcs: + if should_increment_rowspan(tc): + rowspan += 1 + else: + rowspan = 1 + if rowspan > 1: + result = rowspan + return str(result) + + def get_colspan(self, el): + grid_span = find_first(el, 'gridSpan') + 
if grid_span is None: + return '' + return grid_span.attrib['val'] + + def parse_table_cell_contents(self, el, text): + parsed = text + + next_el = self.pre_processor.next(el) + if next_el is not None: + if self._should_append_break_tag(next_el): + parsed += self.break_tag() + return parsed + + def parse_hyperlink(self, el, text): + rId = el.get('id') + href = self.rels_dict.get(rId) + if not href: return text - else: + href = self.escape(href) + return self.hyperlink(text, href) + + def _get_image_id(self, el): + # Drawings + blip = find_first(el, 'blip') + if blip is not None: + # On drawing tags the id is actually whatever is returned from the + # embed attribute on the blip tag. Thanks a lot Microsoft. + return blip.get('embed') + # Picts + imagedata = find_first(el, 'imagedata') + if imagedata is not None: + return imagedata.get('id') + + def _convert_image_size(self, size): + return size / EMUS_PER_PIXEL + + def _get_image_size(self, el): + """ + If we can't find a height or width, return 0 for whichever is not + found, then rely on the `image` handler to strip those attributes. This + functionality can change once we integrate PIL. + """ + sizes = find_first(el, 'ext') + if sizes is not None and sizes.get('cx'): + if sizes.get('cx'): + x = self._convert_image_size(int(sizes.get('cx'))) + if sizes.get('cy'): + y = self._convert_image_size(int(sizes.get('cy'))) + return ( + '%dpx' % x, + '%dpx' % y, + ) + shape = find_first(el, 'shape') + if shape is not None and shape.get('style') is not None: + # If either of these are not set, rely on the method `image` to not + # use either of them. 
+ x = 0 + y = 0 + styles = shape.get('style').split(';') + + for s in styles: + if s.startswith('height:'): + y = s.split(':')[1] + if s.startswith('width:'): + x = s.split(':')[1] + return x, y + return 0, 0 + + def parse_image(self, el): + x, y = self._get_image_size(el) + rId = self._get_image_id(el) + src = self.rels_dict.get(rId) + if not src: + return '' + src = os.path.join( + 'word', + src, + ) + if src in self._image_data: + filename = os.path.split(src)[-1] + return self.image(self._image_data[src], filename, x, y) + return '' + + def _is_style_on(self, value): + """ + For b, i, u (bold, italics, and underline) merely having the tag is not + sufficient. You need to check to make sure it is not set to "false" as + well. + """ + return value not in DISABLED_STYLE_VALUES + + def parse_t(self, el, parsed): + if el.text is None: + return '' + return self.escape(el.text) + + def parse_tab(self, el, parsed): + return self.tab() + + def parse_hyphen(self, el, parsed): + return '-' + + def parse_break_tag(self, el, parsed): + return self.break_tag() + + def parse_deletion(self, el, parsed): + if el.text is None: return '' + return self.deletion(el.text, '', '') - def get_list_style(self, numval): - ids = self.numbering_root.findall_all('num') - for _id in ids: - if _id.attrib['numId'] == numval: - abstractid = _id.find('abstractNumId') - abstractid = abstractid.attrib['val'] - style_information = self.numbering_root.findall_all( - 'abstractNum', - ) - for info in style_information: - if info.attrib['abstractNumId'] == abstractid: - for i in el_iter(info): - if i.find('numFmt') is not None: - return i.find('numFmt').attrib - - def get_comments(self, doc_id): - if self.comment_store is None: - # TODO throw appropriate error - comment_root = ElementTree.fromstring( - remove_namespaces(self.comment_text), + def parse_insertion(self, el, parsed): + return self.insertion(parsed, '', '') + + def parse_r(self, el, parsed): + """ + Parse the running text. 
+ """ + text = parsed + if not text: + return '' + + run_properties = {} + + # Get the rPr for the current style, they are the defaults. + p = find_ancestor_with_tag(self.pre_processor, el, 'p') + paragraph_style = self.memod_tree_op('find_first', p, 'pStyle') + if paragraph_style is not None: + style = paragraph_style.get('val') + style_defaults = self.styles_dict.get(style, {}) + run_properties.update( + style_defaults.get('default_run_properties', {}), ) - ids_and_info = {} - ids = comment_root.findall_all('comment') - for _id in ids: - ids_and_info[_id.attrib['id']] = { - "author": _id.attrib['author'], - "date": _id.attrib['date'], - "text": _id.findall_all('t')[0].text, - } - self.comment_store = ids_and_info - return self.comment_store[doc_id] + + # Get the rPr for the current r tag, they are overrides. + run_properties_element = el.find('rPr') + if run_properties_element: + local_run_properties = self._parse_run_properties( + run_properties_element, + ) + run_properties.update(local_run_properties) + + inline_tag_handlers = { + 'b': self.bold, + 'i': self.italics, + 'u': self.underline, + 'caps': self.caps, + 'smallCaps': self.small_caps, + 'strike': self.strike, + 'dstrike': self.strike, + 'vanish': self.hide, + 'webHidden': self.hide, + } + styles_needing_application = [] + for property_name, property_value in run_properties.items(): + # These tags are a little different, handle them separately + # from the rest. + # This could be a superscript or a subscript + if property_name == 'vertAlign': + if property_value == 'superscript': + styles_needing_application.append(self.superscript) + elif property_value == 'subscript': + styles_needing_application.append(self.subscript) + else: + if ( + property_name in inline_tag_handlers and + self._is_style_on(property_value) + ): + styles_needing_application.append( + inline_tag_handlers[property_name], + ) + + # Apply all the handlers. 
+ for func in styles_needing_application: + text = func(text) + + return text @property def parsed(self): @@ -335,10 +727,26 @@ def linebreak(self): def paragraph(self, text): return text + @abstractmethod + def heading(self, text, heading_level): + return text + @abstractmethod def insertion(self, text, author, date): return text + @abstractmethod + def hyperlink(self, text, href): + return text + + @abstractmethod + def image_handler(self, path): + return path + + @abstractmethod + def image(self, data, filename, x, y): + return self.image_handler(data) + @abstractmethod def deletion(self, text, author, date): return text @@ -355,6 +763,30 @@ def italics(self, text): def underline(self, text): return text + @abstractmethod + def caps(self, text): + return text + + @abstractmethod + def small_caps(self, text): + return text + + @abstractmethod + def strike(self, text): + return text + + @abstractmethod + def hide(self, text): + return text + + @abstractmethod + def superscript(self, text): + return text + + @abstractmethod + def subscript(self, text): + return text + @abstractmethod def tab(self): return True @@ -388,15 +820,5 @@ def page_break(self): return True @abstractmethod - def right_justify(self, text): - return text - - @abstractmethod - def center_justify(self, text): - return text - - @abstractmethod - def indent(self, text, left=None, right=None, firstLine=None): - return text - - #TODO JUSTIFIED JUSTIFIED TEXT + def indent(self, text, left='', right='', firstLine=''): + return text # TODO JUSTIFIED JUSTIFIED TEXT diff --git a/pydocx/__init__.py b/pydocx/__init__.py index 9b42e00f..e59babb6 100644 --- a/pydocx/__init__.py +++ b/pydocx/__init__.py @@ -1,8 +1,34 @@ -from .parsers import * +import sys +from .parsers import Docx2Html, Docx2Markdown + def docx2html(path): return Docx2Html(path).parsed + def docx2markdown(path): return Docx2Markdown(path).parsed +VERSION = '0.3.13' + + +def main(): + try: + parser_to_use = sys.argv[1] + path_to_docx = 
sys.argv[2] + path_to_html = sys.argv[3] + except IndexError: + print 'Must specify which parser as well as the file to convert and the name of the resulting file.' # noqa + sys.exit() + if parser_to_use == '--html': + html = Docx2Html(path_to_docx).parsed + elif parser_to_use == '--markdown': + html = Docx2Markdown(path_to_docx).parsed + else: + print 'Only valid parsers are --html and --markdown' + sys.exit() + with open(path_to_html, 'w') as f: + f.write(html.encode('utf-8')) + +if __name__ == '__main__': + main() diff --git a/pydocx/exceptions.py b/pydocx/exceptions.py new file mode 100644 index 00000000..cdff556a --- /dev/null +++ b/pydocx/exceptions.py @@ -0,0 +1,2 @@ +class MalformedDocxException(Exception): + pass diff --git a/pydocx/fixtures/all_configured_styles.docx b/pydocx/fixtures/all_configured_styles.docx new file mode 100644 index 00000000..8f514372 Binary files /dev/null and b/pydocx/fixtures/all_configured_styles.docx differ diff --git a/pydocx/fixtures/attachment_is_tiff.docx b/pydocx/fixtures/attachment_is_tiff.docx new file mode 100644 index 00000000..774362ca Binary files /dev/null and b/pydocx/fixtures/attachment_is_tiff.docx differ diff --git a/pydocx/fixtures/bigger_font_size_to_header.docx b/pydocx/fixtures/bigger_font_size_to_header.docx new file mode 100644 index 00000000..c722888b Binary files /dev/null and b/pydocx/fixtures/bigger_font_size_to_header.docx differ diff --git a/pydocx/fixtures/convert_p_to_h.docx b/pydocx/fixtures/convert_p_to_h.docx new file mode 100644 index 00000000..53769e15 Binary files /dev/null and b/pydocx/fixtures/convert_p_to_h.docx differ diff --git a/pydocx/fixtures/fake_headings_by_length.docx b/pydocx/fixtures/fake_headings_by_length.docx new file mode 100644 index 00000000..a130f5ba Binary files /dev/null and b/pydocx/fixtures/fake_headings_by_length.docx differ diff --git a/pydocx/fixtures/greek_alphabet.docx b/pydocx/fixtures/greek_alphabet.docx new file mode 100644 index 00000000..46ab5429 Binary files 
/dev/null and b/pydocx/fixtures/greek_alphabet.docx differ diff --git a/pydocx/fixtures/has_image.docx b/pydocx/fixtures/has_image.docx new file mode 100644 index 00000000..2ebd0bd0 Binary files /dev/null and b/pydocx/fixtures/has_image.docx differ diff --git a/pydocx/fixtures/has_missing_image.docx b/pydocx/fixtures/has_missing_image.docx new file mode 100644 index 00000000..996e6671 Binary files /dev/null and b/pydocx/fixtures/has_missing_image.docx differ diff --git a/pydocx/fixtures/has_title.docx b/pydocx/fixtures/has_title.docx new file mode 100644 index 00000000..a87d88ed Binary files /dev/null and b/pydocx/fixtures/has_title.docx differ diff --git a/pydocx/fixtures/header_footer_problem.docx b/pydocx/fixtures/header_footer_problem.docx new file mode 100644 index 00000000..6bc49a7a Binary files /dev/null and b/pydocx/fixtures/header_footer_problem.docx differ diff --git a/pydocx/fixtures/headers.docx b/pydocx/fixtures/headers.docx new file mode 100644 index 00000000..890104c7 Binary files /dev/null and b/pydocx/fixtures/headers.docx differ diff --git a/pydocx/fixtures/headers_with_full_line_styles.docx b/pydocx/fixtures/headers_with_full_line_styles.docx new file mode 100644 index 00000000..38d6f6a8 Binary files /dev/null and b/pydocx/fixtures/headers_with_full_line_styles.docx differ diff --git a/pydocx/fixtures/include_tabs.docx b/pydocx/fixtures/include_tabs.docx new file mode 100644 index 00000000..f7f53e92 Binary files /dev/null and b/pydocx/fixtures/include_tabs.docx differ diff --git a/pydocx/fixtures/inline_tags.docx b/pydocx/fixtures/inline_tags.docx new file mode 100644 index 00000000..4aba2347 Binary files /dev/null and b/pydocx/fixtures/inline_tags.docx differ diff --git a/pydocx/fixtures/justification.docx b/pydocx/fixtures/justification.docx new file mode 100644 index 00000000..7f8a3bf1 Binary files /dev/null and b/pydocx/fixtures/justification.docx differ diff --git a/pydocx/fixtures/list_in_table.docx b/pydocx/fixtures/list_in_table.docx new 
file mode 100644 index 00000000..d1a87388 Binary files /dev/null and b/pydocx/fixtures/list_in_table.docx differ diff --git a/pydocx/fixtures/list_to_header.docx b/pydocx/fixtures/list_to_header.docx new file mode 100644 index 00000000..f9b3946e Binary files /dev/null and b/pydocx/fixtures/list_to_header.docx differ diff --git a/pydocx/fixtures/lists_with_styles.docx b/pydocx/fixtures/lists_with_styles.docx new file mode 100644 index 00000000..c1c7ecf8 Binary files /dev/null and b/pydocx/fixtures/lists_with_styles.docx differ diff --git a/pydocx/fixtures/localDpi.docx b/pydocx/fixtures/localDpi.docx new file mode 100644 index 00000000..0f6d7f77 Binary files /dev/null and b/pydocx/fixtures/localDpi.docx differ diff --git a/pydocx/fixtures/missing_content.docx b/pydocx/fixtures/missing_content.docx new file mode 100644 index 00000000..21bed964 Binary files /dev/null and b/pydocx/fixtures/missing_content.docx differ diff --git a/pydocx/fixtures/missing_numbering.docx b/pydocx/fixtures/missing_numbering.docx new file mode 100644 index 00000000..5034f524 Binary files /dev/null and b/pydocx/fixtures/missing_numbering.docx differ diff --git a/pydocx/fixtures/missing_style.docx b/pydocx/fixtures/missing_style.docx new file mode 100644 index 00000000..3ded985c Binary files /dev/null and b/pydocx/fixtures/missing_style.docx differ diff --git a/pydocx/fixtures/nested_lists.docx b/pydocx/fixtures/nested_lists.docx new file mode 100644 index 00000000..0f9cecbd Binary files /dev/null and b/pydocx/fixtures/nested_lists.docx differ diff --git a/pydocx/fixtures/nested_table_rowspan.docx b/pydocx/fixtures/nested_table_rowspan.docx new file mode 100644 index 00000000..b43b8a0d Binary files /dev/null and b/pydocx/fixtures/nested_table_rowspan.docx differ diff --git a/pydocx/fixtures/nested_tables.docx b/pydocx/fixtures/nested_tables.docx new file mode 100644 index 00000000..af704d4d Binary files /dev/null and b/pydocx/fixtures/nested_tables.docx differ diff --git 
a/pydocx/fixtures/no_break_hyphen.docx b/pydocx/fixtures/no_break_hyphen.docx new file mode 100644 index 00000000..64d68fa3 Binary files /dev/null and b/pydocx/fixtures/no_break_hyphen.docx differ diff --git a/pydocx/fixtures/resized_image.docx b/pydocx/fixtures/resized_image.docx new file mode 100644 index 00000000..913099c4 Binary files /dev/null and b/pydocx/fixtures/resized_image.docx differ diff --git a/pydocx/fixtures/shift_enter.docx b/pydocx/fixtures/shift_enter.docx new file mode 100644 index 00000000..4128c0a2 Binary files /dev/null and b/pydocx/fixtures/shift_enter.docx differ diff --git a/pydocx/fixtures/simple.docx b/pydocx/fixtures/simple.docx new file mode 100644 index 00000000..1d2a1c23 Binary files /dev/null and b/pydocx/fixtures/simple.docx differ diff --git a/pydocx/fixtures/simple_lists.docx b/pydocx/fixtures/simple_lists.docx new file mode 100644 index 00000000..c09ad744 Binary files /dev/null and b/pydocx/fixtures/simple_lists.docx differ diff --git a/pydocx/fixtures/simple_table.docx b/pydocx/fixtures/simple_table.docx new file mode 100644 index 00000000..26de483c Binary files /dev/null and b/pydocx/fixtures/simple_table.docx differ diff --git a/pydocx/fixtures/special_chars.docx b/pydocx/fixtures/special_chars.docx new file mode 100644 index 00000000..b4b9287f Binary files /dev/null and b/pydocx/fixtures/special_chars.docx differ diff --git a/pydocx/fixtures/split_header.docx b/pydocx/fixtures/split_header.docx new file mode 100644 index 00000000..cc4bd5cf Binary files /dev/null and b/pydocx/fixtures/split_header.docx differ diff --git a/pydocx/fixtures/styled_bolding.docx b/pydocx/fixtures/styled_bolding.docx new file mode 100644 index 00000000..90c6b157 Binary files /dev/null and b/pydocx/fixtures/styled_bolding.docx differ diff --git a/pydocx/fixtures/super_and_subscript.docx b/pydocx/fixtures/super_and_subscript.docx new file mode 100644 index 00000000..06ea2d7a Binary files /dev/null and b/pydocx/fixtures/super_and_subscript.docx differ 
diff --git a/pydocx/fixtures/table_col_row_span.docx b/pydocx/fixtures/table_col_row_span.docx new file mode 100644 index 00000000..856abfdf Binary files /dev/null and b/pydocx/fixtures/table_col_row_span.docx differ diff --git a/pydocx/fixtures/tables_in_lists.docx b/pydocx/fixtures/tables_in_lists.docx new file mode 100644 index 00000000..11859541 Binary files /dev/null and b/pydocx/fixtures/tables_in_lists.docx differ diff --git a/pydocx/fixtures/track_changes_on.docx b/pydocx/fixtures/track_changes_on.docx new file mode 100644 index 00000000..dcb7ba1c Binary files /dev/null and b/pydocx/fixtures/track_changes_on.docx differ diff --git a/pydocx/fixtures/upper_alpha_all_bold.docx b/pydocx/fixtures/upper_alpha_all_bold.docx new file mode 100644 index 00000000..d518b2c5 Binary files /dev/null and b/pydocx/fixtures/upper_alpha_all_bold.docx differ diff --git a/pydocx/lxmlparser.py b/pydocx/lxmlparser.py deleted file mode 100644 index 94b130d3..00000000 --- a/pydocx/lxmlparser.py +++ /dev/null @@ -1,111 +0,0 @@ -import zipfile -from lxml import etree -from StringIO import StringIO -__author__ = 'samportnow' - -#for el in tree.iter(): - # The way lists are handled could double visit certain elements; keep - # track of which elements have been visited and skip any that have been - # visited already. 
- #if el in visited_nodes: - #continue -with zipfile.ZipFile('/Users/samportnow/Documents/pydocx/helloworld.docx') as f: - document = f.read('word/document.xml') - numbering= f.read('word/numbering.xml') -parser=etree.XMLParser(ns_clean=True) -document=StringIO(document) -numbering=StringIO(numbering) -numbering_tree=etree.parse(numbering,parser) -numbering_namespace=numbering_tree.getroot().nsmap['w'] -visited_els=[] - -def get_parsed(): - parser=etree.XMLParser(ns_clean=True) - tree=etree.parse(document,parser) - namespace=tree.getroot().nsmap['w'] - #rpr is run properties for the paragraph mark - paragraph='' - run_text='' - running_text='' - for el in tree.iter(): - if el.tag=='{%s}p' %namespace: - for wp in el.iter(): - if wp.tag =='{%s}ins' %namespace: - for text in wp.iterchildren(): - if text not in visited_els: - run_text +='
'+get_text(text,namespace,visited_els)+'
' - visited_els.append(text) - if wp.tag=='{%s}r' %namespace and wp not in visited_els: - run_text+=get_text(wp,namespace,visited_els) - visited_els.append(wp) - if not el.getchildren(): - run_text+='
' - if wp.tag == '{%s}ilvl' %namespace: - for lst in el.iter(): - if lst.find('{%s}numId' %namespace) is not None and el not in visited_els: - numval = lst.find('{%s}numId' %namespace).attrib['{%s}val' %namespace] - lst_type=get_list_style(numval) - if get_text(lst,namespace,visited_els) and el not in visited_els and lst_type['{%s}val' %namespace] != 'bullet': - if lst.getnext() is not None: - if lst not in visited_els: - while lst.getnext() is not None: - if lst not in visited_els: - text = get_text(lst,namespace,visited_els) - next_txt = get_text(lst.getnext(),namespace,visited_els) - running_text += text + next_txt - visited_els.append(lst) - visited_els.append(lst.getnext()) - lst=lst.getnext() - else: - run_text += '
  • ' + running_text + '
  • ' - break - else: - run_text +='
  • ' + get_text(lst, namespace, visited_els) + '
  • ' - visited_els.append(lst) - print running_text - return run_text - - -def get_text(wp,namespace,visited_els): - run_text= '' - decorator = '' - closing = '' - if wp.find('{%s}tab' %namespace) is not None: - run_text+='%nbsp' - if wp.find('{%s}rPr' %namespace) is not None: - for tag in wp.iter(): - if tag.find('{%s}u' %namespace) is not None: - if wp.find('{%s}t' %namespace) is not None: - decorator +='' - closing += '' - visited_els.append(wp.find('{%s}t' %namespace)) - if tag.find('{%s}i' %namespace) is not None: - if wp.find('{%s}t' %namespace) is not None: - decorator += '' - closing += '' - visited_els.append(wp.find('{%s}t' %namespace)) - if tag.find('{%s}b' %namespace) is not None: - if wp.find('{%s}t' %namespace) is not None: - decorator += '' - closing += '' - visited_els.append(wp.find('{%s}t' %namespace)) - run_text = wp.find('{%s}t' %namespace).text - run_text = decorator + run_text + closing - if wp.find('{%s}t' %namespace) is not None and wp.find('{%s}t' %namespace) not in visited_els: - run_text+=wp.find('{%s}t' %namespace).text - return run_text - -def get_list_style(numval): - ids = numbering_tree.findall('{%s}num' %numbering_namespace) - for id in ids: - if id.attrib['{%s}numId' %numbering_namespace] == numval: - abstractid=id.find('{%s}abstractNumId' %numbering_namespace) - abstractid=abstractid.attrib['{%s}val' %numbering_namespace] - style_information=numbering_tree.findall('{%s}abstractNum' %numbering_namespace) - for info in style_information: - if info.attrib['{%s}abstractNumId' %numbering_namespace] == abstractid: - for i in info.iter(): - if i.find('{%s}numFmt' %numbering_namespace) is not None: - return i.find('{%s}numFmt' %numbering_namespace).attrib - -print get_parsed() diff --git a/pydocx/parsers/Docx2Html.py b/pydocx/parsers/Docx2Html.py index bfaad2a6..e4067a10 100644 --- a/pydocx/parsers/Docx2Html.py +++ b/pydocx/parsers/Docx2Html.py @@ -1,21 +1,46 @@ -from pydocx.DocxParser import DocxParser - +import base64 import 
xml.sax.saxutils +from pydocx.DocxParser import DocxParser + class Docx2Html(DocxParser): @property def parsed(self): - self._parsed = self._parsed.replace('

    ', '
    ') - self._parsed = self._parsed.replace('


    ', '

    ') - self._parsed = self._parsed.replace('


      ', '

        ') - return ( - '{content}' - ).format(content=self._parsed) + content = self._parsed + content = "%(head)s%(content)s" % { + 'head': self.head(), + 'content': content, + } + return unicode(content) + + def head(self): + return "%(style)s" % { + 'style': self.style(), + } + + def style(self): + result = ( + '' + ) % { + #multiple by (4/3) to get to px + 'width': (self.page_width * (4 / 3)), + } + return result def escape(self, text): return xml.sax.saxutils.quoteattr(text)[1:-1] @@ -26,61 +51,155 @@ def linebreak(self, pre=None): def paragraph(self, text, pre=None): return '

        ' + text + '

        ' + def heading(self, text, heading_value): + return '<%(tag)s>%(text)s' % { + 'tag': heading_value, + 'text': text, + } + def insertion(self, text, author, date): return ( - "{text}" - ).format(author=author, date=date, text=text) + "%(text)s" + ) % { + 'author': author, + 'date': date, + 'text': text, + } + + def hyperlink(self, text, href): + if text == '': + return '' + return '%(text)s' % { + 'href': href, + 'text': text, + } + + def image_handler(self, image_data, filename): + extension = filename.split('.')[-1].lower() + b64_encoded_src = 'data:image/%s;base64,%s' % ( + extension, + base64.b64encode(image_data), + ) + b64_encoded_src = self.escape(b64_encoded_src) + return b64_encoded_src + + def image(self, image_data, filename, x, y): + src = self.image_handler(image_data, filename) + if not src: + return '' + if all([x, y]): + return '' % ( + src, + y, + x, + ) + else: + return '' % src def deletion(self, text, author, date): return ( - "{text}" - ).format(author=author, date=date, text=text) + "%(text)s" + ) % { + 'author': author, + 'date': date, + 'text': text, + } def list_element(self, text): - return "
      • {text}
      • ".format(text=text) + return "
      • %(text)s
      • " % { + 'text': text, + } - def ordered_list(self, text): - return "
          {text}
        ".format(text=text) + def ordered_list(self, text, list_style): + return '
          %(text)s
        ' % { + 'text': text, + 'list_style': list_style, + } def unordered_list(self, text): - return "
          {text}
        ".format(text=text) + return "
          %(text)s
        " % { + 'text': text, + } def bold(self, text): - return '' + text + '' + return '' + text + '' def italics(self, text): - return '' + text + '' + return '' + text + '' def underline(self, text): - return '' + text + '' + return '' + text + '' + + def caps(self, text): + return '' + text + '' + + def small_caps(self, text): + return '' + text + '' + + def strike(self, text): + return '' + text + '' + + def hide(self, text): + return '' + text + '' + + def superscript(self, text): + return '%(text)s' % { + 'text': text, + } + + def subscript(self, text): + return '%(text)s' % { + 'text': text, + } def tab(self): # Insert before the text right?? So got the text and just do an insert # at the beginning! - return '    ' + return '    ' def table(self, text): - return '' + text + '
        ' + return '' + text + '
        ' def table_row(self, text): return '' + text + '' - def table_cell(self, text): - return '' + text + '' + def table_cell(self, text, col='', row=''): + slug = '' - - def center_justify(self, text): - return "
        " + text + '
        ' - - def right_justify(self, text): - return "
        " + text + '
        ' - - def indent(self, text, right, left, firstLine): - return "
        {text}
        ".format( - left=left, - text=text, - ) + return '
        ' + + def indent(self, text, just='', firstLine='', left='', right=''): + slug = '' diff --git a/pydocx/parsers/Docx2Markdown.py b/pydocx/parsers/Docx2Markdown.py index 1bb43e16..d023df7a 100644 --- a/pydocx/parsers/Docx2Markdown.py +++ b/pydocx/parsers/Docx2Markdown.py @@ -1,5 +1,6 @@ from pydocx.DocxParser import DocxParser + class Docx2Markdown(DocxParser): def escape(self, text): return text @@ -17,8 +18,9 @@ def bold(self, text): return '**' + text + '**' def italics(self, text): - # TODO do we need a "pre" variable, so I can check for *italics**italics* and turn it into *italicsitatlics*? + # TODO do we need a "pre" variable, so I can check for + # *italics**italics* and turn it into *italicsitatlics*? return '*' + text + '*' def underline(self, text): - return '***' +text + '***' \ No newline at end of file + return '***' + text + '***' diff --git a/pydocx/parsers/__init__.py b/pydocx/parsers/__init__.py index a9524657..7684ae65 100644 --- a/pydocx/parsers/__init__.py +++ b/pydocx/parsers/__init__.py @@ -1,2 +1,4 @@ -from .Docx2Html import * -from .Docx2Markdown import * \ No newline at end of file +from pydocx.parsers.Docx2Html import Docx2Html +from pydocx.parsers.Docx2Markdown import Docx2Markdown + +__all__ = (Docx2Html, Docx2Markdown) diff --git a/pydocx/tests/__init__.py b/pydocx/tests/__init__.py new file mode 100644 index 00000000..82341e05 --- /dev/null +++ b/pydocx/tests/__init__.py @@ -0,0 +1,196 @@ +#from unittest import TestCase +import re +from contextlib import contextmanager + +from pydocx.parsers.Docx2Html import Docx2Html +from pydocx.utils import ( + parse_xml_from_string, +) +from pydocx.tests.document_builder import DocxBuilder as DXB +from unittest import TestCase + +STYLE = ( + '' +) + +BASE_HTML = ''' + + + %s + + %%s + +''' % STYLE + + +def assert_html_equal(actual_html, expected_html): + assert collapse_html( + actual_html, + ) == collapse_html( + expected_html + ), actual_html + + +def collapse_html(html): + """ + Remove 
insignificant whitespace from the html. + + >>> print collapse_html('''\\ + ...

        + ... Heading + ...

        + ... ''') +

        Heading

        + >>> print collapse_html('''\\ + ...

        + ... Paragraph with + ... multiple lines. + ...

        + ... ''') +

        Paragraph with multiple lines.

        + """ + def smart_space(match): + # Put a space in between lines, unless exactly one side of the line + # break butts up against a tag. + before = match.group(1) + after = match.group(2) + space = ' ' + if before == '>' or after == '<': + space = '' + return before + space + after + # Replace newlines and their surrounding whitespace with a single space (or + # empty string) + html = re.sub( + r'(>?)\s*\n\s*( + + {{ body }} + diff --git a/pydocx/tests/templates/drawing.xml b/pydocx/tests/templates/drawing.xml new file mode 100644 index 00000000..dfd470b4 --- /dev/null +++ b/pydocx/tests/templates/drawing.xml @@ -0,0 +1,65 @@ + + + + + + + + + + + 2397125 + + + 0 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/pydocx/tests/templates/hyperlink.xml b/pydocx/tests/templates/hyperlink.xml new file mode 100644 index 00000000..83645948 --- /dev/null +++ b/pydocx/tests/templates/hyperlink.xml @@ -0,0 +1,5 @@ + + {% for run_tag in run_tags %} + {{ run_tag }} + {% endfor %} + diff --git a/pydocx/tests/templates/insert.xml b/pydocx/tests/templates/insert.xml new file mode 100644 index 00000000..afeb2691 --- /dev/null +++ b/pydocx/tests/templates/insert.xml @@ -0,0 +1,5 @@ + + {% for run_tag in run_tags %} + {{ run_tag }} + {% endfor %} + diff --git a/pydocx/tests/templates/linebreak.xml b/pydocx/tests/templates/linebreak.xml new file mode 100644 index 00000000..ab92e811 --- /dev/null +++ b/pydocx/tests/templates/linebreak.xml @@ -0,0 +1 @@ + diff --git a/pydocx/tests/templates/numbering.xml b/pydocx/tests/templates/numbering.xml new file mode 100644 index 00000000..4eaac3cc --- /dev/null +++ b/pydocx/tests/templates/numbering.xml @@ -0,0 +1,23 @@ + + + {% for num_id, ilvl_data in numbering_dict.items() %} + + {% for ilvl, format in ilvl_data.items() %} + + + + + + + + + + {% endfor %} + + {% endfor %} + {% for num_id in numbering_dict %} + + + + {% endfor %} + diff --git a/pydocx/tests/templates/p.xml 
b/pydocx/tests/templates/p.xml new file mode 100644 index 00000000..7a78a060 --- /dev/null +++ b/pydocx/tests/templates/p.xml @@ -0,0 +1,19 @@ + + + + {% if is_list %} + + {% if ilvl != None %} + + {% endif %} + {% if numId != None %} + + {% endif %} + + {% endif %} + {% if jc %}{% endif %} + + {% for run_tag in run_tags %} + {{ run_tag }} + {% endfor %} + diff --git a/pydocx/tests/templates/pict.xml b/pydocx/tests/templates/pict.xml new file mode 100644 index 00000000..26f772a3 --- /dev/null +++ b/pydocx/tests/templates/pict.xml @@ -0,0 +1,17 @@ + + + + + + + + + + + + + {% if r_id %}{% endif %} + + + + diff --git a/pydocx/tests/templates/r.xml b/pydocx/tests/templates/r.xml new file mode 100644 index 00000000..2f28a66b --- /dev/null +++ b/pydocx/tests/templates/r.xml @@ -0,0 +1,6 @@ + + {{ rpr }} + {% for element in elements %} + {{ element }} + {% endfor %} + diff --git a/pydocx/tests/templates/rpr.xml b/pydocx/tests/templates/rpr.xml new file mode 100644 index 00000000..f49eb08b --- /dev/null +++ b/pydocx/tests/templates/rpr.xml @@ -0,0 +1,5 @@ + + {% for tag, value in tags.items() %} + + {% endfor %} + diff --git a/pydocx/tests/templates/sdt.xml b/pydocx/tests/templates/sdt.xml new file mode 100644 index 00000000..fe9a7e77 --- /dev/null +++ b/pydocx/tests/templates/sdt.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + {{ p_tag }} + + diff --git a/pydocx/tests/templates/sectPr.xml b/pydocx/tests/templates/sectPr.xml new file mode 100644 index 00000000..16a12050 --- /dev/null +++ b/pydocx/tests/templates/sectPr.xml @@ -0,0 +1,3 @@ + + {{ p_tag }} + diff --git a/pydocx/tests/templates/smart_tag.xml b/pydocx/tests/templates/smart_tag.xml new file mode 100644 index 00000000..e45ee5b9 --- /dev/null +++ b/pydocx/tests/templates/smart_tag.xml @@ -0,0 +1,5 @@ + + {% for run_tag in run_tags %} + {{ run_tag }} + {% endfor %} + diff --git a/pydocx/tests/templates/style.xml b/pydocx/tests/templates/style.xml new file mode 100644 index 00000000..5fa9f00f --- /dev/null +++ 
b/pydocx/tests/templates/style.xml @@ -0,0 +1,15 @@ + + + + + + + + + + + + + + + diff --git a/pydocx/tests/templates/styles.xml b/pydocx/tests/templates/styles.xml new file mode 100644 index 00000000..a30e752e --- /dev/null +++ b/pydocx/tests/templates/styles.xml @@ -0,0 +1,6 @@ + + + {% for style in style_tags %} + {{ style }} + {% endfor %} + diff --git a/pydocx/tests/templates/t.xml b/pydocx/tests/templates/t.xml new file mode 100644 index 00000000..81d562b7 --- /dev/null +++ b/pydocx/tests/templates/t.xml @@ -0,0 +1,5 @@ +{% if text %} +{{ text }} +{% else %} + +{% endif %} diff --git a/pydocx/tests/templates/table.xml b/pydocx/tests/templates/table.xml new file mode 100644 index 00000000..e47783b6 --- /dev/null +++ b/pydocx/tests/templates/table.xml @@ -0,0 +1,18 @@ + + + + + + + + + + + + + + + {% for table_row in table_rows %} + {{ table_row }} + {% endfor %} + diff --git a/pydocx/tests/templates/tc.xml b/pydocx/tests/templates/tc.xml new file mode 100644 index 00000000..eff9ce0d --- /dev/null +++ b/pydocx/tests/templates/tc.xml @@ -0,0 +1,28 @@ + + + + {% if merge_continue %} + + + {% endif %} + {% if merge %} + + + {% endif %} + + + + + + + + + + + + + + {% if paragraph %} + {{ paragraph }} + {% endif %} + diff --git a/pydocx/tests/templates/text_delete.xml b/pydocx/tests/templates/text_delete.xml new file mode 100644 index 00000000..783b3ad3 --- /dev/null +++ b/pydocx/tests/templates/text_delete.xml @@ -0,0 +1,10 @@ + + {% for deleted_text in deleted_texts %} + + + + + {{ deleted_text }} + + {% endfor %} + diff --git a/pydocx/tests/templates/tr.xml b/pydocx/tests/templates/tr.xml new file mode 100644 index 00000000..6e2f6925 --- /dev/null +++ b/pydocx/tests/templates/tr.xml @@ -0,0 +1,8 @@ + + + + + {% for table_cell in table_cells %} + {{ table_cell }} + {% endfor %} + diff --git a/pydocx/tests/test_docx.py b/pydocx/tests/test_docx.py new file mode 100644 index 00000000..e9d77533 --- /dev/null +++ b/pydocx/tests/test_docx.py @@ -0,0 +1,849 @@ +import 
base64 +from os import path +from tempfile import NamedTemporaryFile + +from nose.plugins.skip import SkipTest +from nose.tools import raises + +from pydocx.tests import assert_html_equal, BASE_HTML +from pydocx.parsers.Docx2Html import Docx2Html +from pydocx.DocxParser import ZipFile +from pydocx.exceptions import MalformedDocxException + + +def convert(path, *args, **kwargs): + return Docx2Html(path, *args, **kwargs).parsed + + +def test_extract_html(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'simple.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        + Simple text +

        +
          +
        1. one
        2. +
        3. two
        4. +
        5. three
        6. +
        + + + + + + + + + +
        Cell1Cell2
        Cell3Cell4
        + ''') + + +def test_nested_list(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'nested_lists.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +
          +
        1. one
        2. +
        3. two
        4. +
        5. three +
            +
          1. AAA
          2. +
          3. BBB
          4. +
          5. CCC +
              +
            1. alpha
            2. +
            +
          6. +
          +
        6. +
        7. four
        8. +
        +
          +
        1. xxx +
            +
          1. yyy
          2. +
          +
        2. +
        +
          +
        • www +
            +
          • zzz
          • +
          +
        • +
        + ''') + + +def test_simple_list(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'simple_lists.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +
          +
        1. One
        2. +
        +
          +
        • two
        • +
        + ''') + + +def test_inline_tags(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'inline_tags.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ( + '

        This sentence has some bold, ' + 'some italics and some ' + 'underline, ' + 'as well as a hyperlink.

        ' + )) + + +def test_all_configured_styles(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'all_configured_styles.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        aaa

        +

        bbb

        +

        ccc

        +

        ddd

        +

        eee

        +

        fff

        +

        ggg

        +

        hhh

        +

        iii

        + ''') + + +def test_super_and_subscript(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'super_and_subscript.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAABBB

        +

        CCCDDD

        + ''') + + +def test_unicode(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'greek_alphabet.docx', + ) + actual_html = convert(file_path) + assert actual_html is not None + assert u'\u0391\u03b1' in actual_html + + +def test_special_chars(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'special_chars.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        & < > link

        ''') # noqa + + +def test_include_tabs(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'include_tabs.docx', + ) + actual_html = convert(file_path) + assert_html_equal( + actual_html, + BASE_HTML % '

        AAA    BBB

        ' + ) + + +def test_table_col_row_span(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'table_col_row_span.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' + + + + + + + + + + + + + + + + + + +
        AAA
        BBBCCC
        DDD
        +
        EEE +
        FFF
        +
        GGG +
        + + + + + + + + + + + + + + + + + + + + + + +
        1234
        567
        89
        10111213
        + ''') + + +def test_nested_table_rowspan(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'nested_table_rowspan.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' + + + + + + + + +
        AAA
        BBB + + + + + + + + +
        CCCDDD
        EEE
        +
        + ''') + + +def test_nested_tables(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'nested_tables.docx', + ) + actual_html = convert(file_path) + # Find out why br tag is there. + assert_html_equal(actual_html, BASE_HTML % ''' + + + + + + + + + +
        AAABBB
        CCC + + + + + + + + + +
        DDDEEE
        FFFGGG
        +
        + ''') + + +def test_list_in_table(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'list_in_table.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' + + + + +
        +
          +
        1. AAA
        2. +
        3. BBB
        4. +
        5. CCC
        6. +
        +
        + ''') + + +def test_tables_in_lists(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'tables_in_lists.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +
          +
        1. AAA
        2. +
        3. BBB + + + + + + + + + +
          CCCDDD
          EEEFFF
          +
        4. +
        5. GGG
        6. +
        + ''') + + +def test_track_changes_on(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'track_changes_on.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        This was some content.

        + ''') + + +def test_headers(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'headers.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        This is an H1

        +

        This is an H2

        +

        This is an H3

        +

        This is an H4

        +
        This is an H5
        +
        This is an H6
        +
        This is an H7
        +
        This is an H8
        +
        This is an H9
        +
        This is an H10
        + ''') + + +def test_split_headers(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'split_header.docx', + ) + + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA

        BBB

        CCC

        + ''') + + +def get_image_data(docx_file_path, image_name): + """ + Return base 64 encoded data for the image_name that is stored in the + docx_file_path. + """ + with ZipFile(docx_file_path) as f: + images = [ + e for e in f.infolist() + if e.filename == 'word/media/%s' % image_name + ] + if not images: + raise AssertionError('%s not in %s' % (image_name, docx_file_path)) + data = f.read(images[0].filename) + return base64.b64encode(data) + + +def test_has_image(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'has_image.docx', + ) + + actual_html = convert(file_path) + image_data = get_image_data(file_path, 'image1.gif') + assert_html_equal(actual_html, BASE_HTML % ''' +

        + AAA + +

        + ''' % image_data) + + +def test_local_dpi(): + # The image in this file does not have a set height or width, show that the + # html will generate without it. + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'localDpi.docx', + ) + actual_html = convert(file_path) + image_data = get_image_data(file_path, 'image1.jpeg') + assert_html_equal(actual_html, BASE_HTML % ''' +

        + ''' % image_data) + + +def test_has_image_using_image_handler(): + raise SkipTest('This needs to be converted to an xml test') + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'has_image.docx', + ) + + def image_handler(*args, **kwargs): + return 'test' + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA

        + ''') + + +def test_headers_with_full_line_styles(): + raise SkipTest('This test is not yet passing') + # Show that if a natural header is completely bold/italics that + # bold/italics will get stripped out. + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'headers_with_full_line_styles.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA

        +

        BBB

        +

        CCC

        + ''') + + +def test_convert_p_to_h(): + raise SkipTest('This test is not yet passing') + # Show when it is correct to convert a p tag to an h tag based on + # bold/italics + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'convert_p_to_h.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA

        +

        BBB

        +

        CCC

        +
          +
        1. DDD
        2. +
        3. EEE
        4. +
        5. FFF
        6. +
        + + + + + + + + + +
        GGGHHH
        IIIJJJ
        + ''') + + +def test_fake_headings_by_length(): + raise SkipTest('This test is not yet passing') + # Show that converting p tags to h tags has a length limit. If the p tag is + # supposed to be converted to an h tag but has more than seven words in the + # paragraph do not convert it. + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'fake_headings_by_length.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        Heading.

        +

        Still a heading.

        +

        + This is not a heading because it is too many words. +

        + ''') + + +def test_shift_enter(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'shift_enter.docx', + ) + + # Test just the convert without clean_html to make sure the first + # break tag is present. + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA
        BBB

        +

        CCC

        +
          +
        1. DDD
          EEE
        2. +
        3. FFF
        4. +
        + + + + + + + + + +
        GGG
        HHH
        III
        JJJ
        KKKLLL
        + ''') + + +def test_lists_with_styles(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'lists_with_styles.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +
          +
        1. AAA
        2. +
        3. BBB +
            +
          1. CCC
          2. +
          3. DDD +
              +
            1. EEE +
                +
              1. FFF
              2. +
              +
            2. +
            +
          4. +
          +
        4. +
        + ''') + + +def test_list_to_header(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'list_to_header.docx', + ) + actual_html = convert(file_path, convert_root_level_upper_roman=True) + # It should be noted that list item `GGG` is upper roman in the word + # document to show that only top level upper romans get converted. + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA

        +
          +
        1. BBB
        2. +
        +

        CCC

        +
          +
        1. DDD
        2. +
        +

        EEE

        +
          +
        1. FFF +
            +
          1. GGG
          2. +
          +
        2. +
        + ''') + + +def test_has_title(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'has_title.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        Title

        +

        Text

        + ''') + + +def test_upper_alpha_all_bold(): + raise SkipTest('This test is not yet passing') + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'upper_alpha_all_bold.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA

        +

        BBB

        +

        CCC

        + ''') + + +def test_simple_table(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'simple_table.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' + + + + + + + + +
        + Cell1
        + Cell3 +
        Cell2
        + And I am writing in the table +
        Cell4
        + ''') + + +def test_justification(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'justification.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        +

        Center Justified
        +

        +

        +

        Right justified
        +

        +

        +

        + Right justified and pushed in from right +
        +

        +

        +

        + Center justified and pushed in from left and it is + great and it is the coolest thing of all time and I like it and + I think it is cool +
        +

        +

        +

        + Left justified and pushed in from left +
        +

        + ''') + + +def test_missing_style(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'missing_style.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA

        + ''') + + +def test_missing_numbering(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'missing_numbering.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA

        +

        BBB

        + ''') + + +def test_styled_bolding(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'styled_bolding.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA

        +

        BBB

        + ''') + + +def test_no_break_hyphen(): + file_path = path.join( + path.abspath(path.dirname(__file__)), + '..', + 'fixtures', + 'no_break_hyphen.docx', + ) + actual_html = convert(file_path) + assert_html_equal(actual_html, BASE_HTML % ''' +

        AAA-BBB

        + ''') + + +@raises(MalformedDocxException) +def test_malformed_docx_exception(): + with NamedTemporaryFile(suffix='.docx') as f: + convert(f.name) + + +def _converter(*args, **kwargs): + # Having a converter that does nothing is the same as if abiword fails to + # convert. + pass + + +#def test_converter_broken(): +# file_path = 'test.doc' +# assert_raises( +# ConversionFailed, +# lambda: convert(file_path, converter=_converter), +# ) + + +def test_fall_back(): + raise SkipTest('This test is not yet passing') + file_path = 'test.doc' + + def fall_back(*args, **kwargs): + return 'success' + html = convert(file_path, fall_back=fall_back, converter=_converter) + assert html == 'success' + + +#@mock.patch('docx2html.core.read_html_file') +#@mock.patch('docx2html.core.get_zip_file_handler') +#def test_html_files(patch_zip_handler, patch_read): +def test_html_files(): + raise SkipTest('This test is not yet passing') + + def raise_assertion(*args, **kwargs): + raise AssertionError('Should not have called get_zip_file_handler') + #patch_zip_handler.side_effect = raise_assertion + + def return_text(*args, **kwargs): + return 'test' + #patch_read.side_effect = return_text + + # Try with an html file + file_path = 'test.html' + + html = convert(file_path) + assert html == 'test' + + # Try again with an htm file. + file_path = 'test.htm' + + html = convert(file_path) + assert html == 'test' diff --git a/pydocx/tests/test_xml.py b/pydocx/tests/test_xml.py new file mode 100644 index 00000000..904ed2b4 --- /dev/null +++ b/pydocx/tests/test_xml.py @@ -0,0 +1,1351 @@ +# -*- coding: utf-8 -*- +import os +import time + +from nose.plugins.skip import SkipTest + +from pydocx.tests.document_builder import DocxBuilder as DXB +from pydocx.tests import ( + XMLDocx2Html, + _TranslationTestCase, +) +from pydocx.utils import parse_xml_from_string, find_all + + +class StyleIsOnTestCase(_TranslationTestCase): + expected_output = """ +

        AAA

        +

        BBB

        +

        CCC

        +

        DDD

        + """ + + def get_xml(self): + tags = [ + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('AAA')], + rpr=DXB.rpr_tag({'b': None}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('BBB')], + rpr=DXB.rpr_tag({'b': 'false'}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('CCC')], + rpr=DXB.rpr_tag({'b': '0'}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('DDD')], + rpr=DXB.rpr_tag({'u': 'none'}), + ), + ], + ), + ] + + body = '' + for tag in tags: + body += tag + xml = DXB.xml(body) + return xml + + +class HyperlinkVanillaTestCase(_TranslationTestCase): + + relationship_dict = { + 'rId0': 'www.google.com', + } + + expected_output = ''' +

        link.

        + ''' + + def get_xml(self): + run_tags = [] + run_tags.append(DXB.r_tag([DXB.t_tag('link')])) + run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)] + run_tags.append(DXB.r_tag([DXB.t_tag('.')])) + body = DXB.p_tag(run_tags) + xml = DXB.xml(body) + return xml + + +class HyperlinkWithMultipleRunsTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': 'www.google.com', + } + + expected_output = ''' +

        link.

        + ''' + + def get_xml(self): + run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'link'] + run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)] + run_tags.append(DXB.r_tag([DXB.t_tag('.')])) + body = DXB.p_tag(run_tags) + xml = DXB.xml(body) + return xml + + +class HyperlinkNoTextTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': 'www.google.com', + } + + expected_output = '' + + def get_xml(self): + run_tags = [] + run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)] + body = DXB.p_tag(run_tags) + xml = DXB.xml(body) + return xml + + +class HyperlinkNotInRelsDictTestCase(_TranslationTestCase): + relationship_dict = { + # 'rId0': 'www.google.com', missing + } + + expected_output = '

        link.

        ' + + def get_xml(self): + run_tags = [] + run_tags.append(DXB.r_tag([DXB.t_tag('link')])) + run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)] + run_tags.append(DXB.r_tag([DXB.t_tag('.')])) + body = DXB.p_tag(run_tags) + xml = DXB.xml(body) + return xml + + +class HyperlinkWithBreakTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': 'www.google.com', + } + + expected_output = '

        link

        ' + + def get_xml(self): + run_tags = [] + run_tags.append(DXB.r_tag([DXB.t_tag('link')])) + run_tags.append(DXB.r_tag([DXB.linebreak()])) + run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)] + body = DXB.p_tag(run_tags) + xml = DXB.xml(body) + return xml + + +class ImageLocal(_TranslationTestCase): + relationship_dict = { + 'rId0': 'media/image1.jpeg', + 'rId1': 'media/image2.jpeg', + } + expected_output = ''' +

        +

        + ''' + + def get_xml(self): + drawing = DXB.drawing(height=None, width=None, r_id='rId0') + pict = DXB.pict(height=None, width=None, r_id='rId1') + tags = [ + drawing, + pict, + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return xml + + +class ImageTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': 'media/image1.jpeg', + 'rId1': 'media/image2.jpeg', + } + expected_output = ''' +

        + +

        +

        + +

        + ''' + + def get_xml(self): + drawing = DXB.drawing(height=20, width=40, r_id='rId0') + pict = DXB.pict(height=21, width=41, r_id='rId1') + tags = [ + drawing, + pict, + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return xml + + def test_get_image_id(self): + parser = XMLDocx2Html( + document_xml=self.get_xml(), + rels_dict=self.relationship_dict, + ) + tree = parse_xml_from_string(self.get_xml()) + els = [] + els.extend(find_all(tree, 'drawing')) + els.extend(find_all(tree, 'pict')) + image_ids = [] + for el in els: + image_ids.append(parser._get_image_id(el)) + expected = [ + 'rId0', + 'rId1', + ] + self.assertEqual( + set(image_ids), + set(expected), + ) + + def test_get_image_sizes(self): + parser = XMLDocx2Html( + document_xml=self.get_xml(), + rels_dict=self.relationship_dict, + ) + tree = parse_xml_from_string(self.get_xml()) + els = [] + els.extend(find_all(tree, 'drawing')) + els.extend(find_all(tree, 'pict')) + image_ids = [] + for el in els: + image_ids.append(parser._get_image_size(el)) + expected = [ + ('40px', '20px'), + ('41pt', '21pt'), + ] + self.assertEqual( + set(image_ids), + set(expected), + ) + + +class ImageNotInRelsDictTestCase(_TranslationTestCase): + relationship_dict = { + # 'rId0': 'media/image1.jpeg', + } + expected_output = '' + + def get_xml(self): + drawing = DXB.drawing(height=20, width=40, r_id='rId0') + body = drawing + + xml = DXB.xml(body) + return xml + + +class ImageNoSizeTestCase(_TranslationTestCase): + relationship_dict = { + 'rId0': os.path.join( + os.path.abspath(os.path.dirname(__file__)), + '..', + 'fixtures', + 'bullet_go_gray.png', + ) + } + image_sizes = { + 'rId0': (0, 0), + } + expected_output = ''' + +

        + +

        + + ''' % relationship_dict['rId0'] + + @staticmethod + def image_handler(image_id, relationship_dict): + return relationship_dict.get(image_id) + + def get_xml(self): + raise SkipTest( + 'Since we are not using PIL, we do not need this test yet.', + ) + drawing = DXB.drawing('rId0') + tags = [ + drawing, + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return xml + + +class TableTag(_TranslationTestCase): + expected_output = ''' + + + + + + + + + +
        AAABBB
        CCCDDD
        + ''' + + def get_xml(self): + cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA')) + cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC')) + cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB')) + cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD')) + rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])] + table = DXB.table(rows) + body = table + xml = DXB.xml(body) + return xml + + +class RowSpanTestCase(_TranslationTestCase): + + expected_output = ''' + + + + + + + + +
        AAABBB
        CCC
        + ''' + + def get_xml(self): + cell1 = DXB.table_cell( + paragraph=DXB.p_tag('AAA'), merge=True, merge_continue=False) + cell2 = DXB.table_cell( + paragraph=DXB.p_tag(None), merge=False, merge_continue=True) + cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB')) + cell4 = DXB.table_cell(paragraph=DXB.p_tag('CCC')) + rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])] + table = DXB.table(rows) + body = table + xml = DXB.xml(body) + return xml + + +class NestedTableTag(_TranslationTestCase): + expected_output = ''' + + + + + + + + + +
        AAABBB
        CCC + + + + + + + + + +
        DDDEEE
        FFFGGG
        +
        + ''' + + def get_xml(self): + cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD')) + cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF')) + cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE')) + cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG')) + rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])] + nested_table = DXB.table(rows) + cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA')) + cell2 = DXB.table_cell(paragraph=DXB.p_tag('CCC')) + cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB')) + cell4 = DXB.table_cell(nested_table) + rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])] + table = DXB.table(rows) + body = table + xml = DXB.xml(body) + return xml + + +class TableWithInvalidTag(_TranslationTestCase): + expected_output = ''' + + + + + + + + + +
        AAABBB
        DDD
        + ''' + + def get_xml(self): + cell1 = DXB.table_cell(paragraph=DXB.p_tag('AAA')) + cell2 = DXB.table_cell('CCC') + cell3 = DXB.table_cell(paragraph=DXB.p_tag('BBB')) + cell4 = DXB.table_cell(paragraph=DXB.p_tag('DDD')) + rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])] + table = DXB.table(rows) + body = table + xml = DXB.xml(body) + return xml + + +class TableWithListAndParagraph(_TranslationTestCase): + expected_output = ''' + + + + +
        +
          +
        1. AAA
        2. +
        3. BBB
        4. +
        + CCC
        + DDD +
        + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 1), + ('BBB', 0, 1), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + els = [ + lis, + DXB.p_tag('CCC'), + DXB.p_tag('DDD'), + ] + td = '' + for el in els: + td += el + cell1 = DXB.table_cell(td) + row = DXB.table_row([cell1]) + table = DXB.table([row]) + body = table + xml = DXB.xml(body) + return xml + + +class SimpleListTestCase(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA
        2. +
        3. BBB
        4. +
        5. CCC
        6. +
        + ''' + + # Ensure its not failing somewhere and falling back to decimal + numbering_dict = { + '1': { + '0': 'lowerLetter', + } + } + + def get_xml(self): + li_text = [ + ('AAA', 0, 1), + ('BBB', 0, 1), + ('CCC', 0, 1), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + + xml = DXB.xml(lis) + return xml + + +class SingleListItemTestCase(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA
        2. +
        + ''' + + # Ensure its not failing somewhere and falling back to decimal + numbering_dict = { + '1': { + '0': 'lowerLetter', + } + } + + def get_xml(self): + li_text = [ + ('AAA', 0, 1), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + + xml = DXB.xml(lis) + return xml + + +class ListWithContinuationTestCase(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA
          BBB
        2. +
        3. CCC + + + + + + + + + +
          DDDEEE
          FFFGGG
          +
        4. +
        5. HHH
        6. +
        + ''' + + def get_xml(self): + cell1 = DXB.table_cell(paragraph=DXB.p_tag('DDD')) + cell2 = DXB.table_cell(paragraph=DXB.p_tag('FFF')) + cell3 = DXB.table_cell(paragraph=DXB.p_tag('EEE')) + cell4 = DXB.table_cell(paragraph=DXB.p_tag('GGG')) + rows = [DXB.table_row([cell1, cell3]), DXB.table_row([cell2, cell4])] + table = DXB.table(rows) + tags = [ + DXB.li(text='AAA', ilvl=0, numId=1), + DXB.p_tag('BBB'), + DXB.li(text='CCC', ilvl=0, numId=1), + table, + DXB.li(text='HHH', ilvl=0, numId=1), + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return xml + + +class ListWithMultipleContinuationTestCase(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA + + + + +
          BBB
          + + + + +
          CCC
          +
        2. +
        3. DDD
        4. +
        + ''' + + def get_xml(self): + cell = DXB.table_cell(paragraph=DXB.p_tag('BBB')) + row = DXB.table_row([cell]) + table1 = DXB.table([row]) + cell = DXB.table_cell(paragraph=DXB.p_tag('CCC')) + row = DXB.table_row([cell]) + table2 = DXB.table([row]) + tags = [ + DXB.li(text='AAA', ilvl=0, numId=1), + table1, + table2, + DXB.li(text='DDD', ilvl=0, numId=1), + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return xml + + +class MangledIlvlTestCase(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA
        2. +
        +
          +
        1. BBB +
            +
          1. CCC
          2. +
          +
        2. +
        + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 2), + ('BBB', 1, 1), + ('CCC', 0, 1), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + + xml = DXB.xml(lis) + return xml + + +class SeperateListsTestCase(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA
        2. +
        +
          +
        1. BBB
        2. +
        +
          +
        1. CCC
        2. +
        + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 2), + # Because AAA and CCC are part of the same list (same list id) + # and BBB is different, these need to be split into three + # lists (or lose everything from BBB and after. + ('BBB', 0, 1), + ('CCC', 0, 2), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + + xml = DXB.xml(lis) + return xml + + +class InvalidIlvlOrderTestCase(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA +
            +
          1. BBB +
              +
            1. CCC
            2. +
            +
          2. +
          +
        2. +
        + ''' + + def get_xml(self): + tags = [ + DXB.li(text='AAA', ilvl=1, numId=1), + DXB.li(text='BBB', ilvl=3, numId=1), + DXB.li(text='CCC', ilvl=2, numId=1), + ] + body = '' + for el in tags: + body += el + + xml = DXB.xml(body) + return xml + + +class DeeplyNestedTableTestCase(_TranslationTestCase): + expected_output = '' + run_expected_output = False + + def get_xml(self): + paragraph = DXB.p_tag('AAA') + + for _ in range(1000): + cell = DXB.table_cell(paragraph) + row = DXB.table_cell([cell]) + table = DXB.table([row]) + body = table + xml = DXB.xml(body) + return xml + + def test_performance(self): + with self.toggle_run_expected_output(): + start_time = time.time() + try: + self.test_expected_output() + except AssertionError: + pass + end_time = time.time() + total_time = end_time - start_time + # This finishes in under a second on python 2.7 + assert total_time < 3, total_time + + +class LargeCellTestCase(_TranslationTestCase): + expected_output = '' + run_expected_output = False + + def get_xml(self): + # Make sure it is over 1000 (which is the recursion limit) + paragraphs = [DXB.p_tag('%d' % i) for i in range(1000)] + cell = DXB.table_cell(paragraphs) + row = DXB.table_cell([cell]) + table = DXB.table([row]) + body = table + xml = DXB.xml(body) + return xml + + def test_performance(self): + with self.toggle_run_expected_output(): + start_time = time.time() + try: + self.test_expected_output() + except AssertionError: + pass + end_time = time.time() + total_time = end_time - start_time + # This finishes in under a second on python 2.7 + assert total_time < 3, total_time + + +class NonStandardTextTagsTestCase(_TranslationTestCase): + expected_output = ''' +

        insert + smarttag

        + ''' + + def get_xml(self): + run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'insert '] + insert_tag = DXB.insert_tag(run_tags) + run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'smarttag'] + smart_tag = DXB.smart_tag(run_tags) + + run_tags = [insert_tag, smart_tag] + body = DXB.p_tag(run_tags) + xml = DXB.xml(body) + return xml + + +class RTagWithNoText(_TranslationTestCase): + expected_output = '' + + def get_xml(self): + p_tag = DXB.p_tag(None) # No text + run_tags = [p_tag] + # The bug is only present in a hyperlink + run_tags = [DXB.hyperlink_tag(r_id='rId0', run_tags=run_tags)] + body = DXB.p_tag(run_tags) + + xml = DXB.xml(body) + return xml + + +class DeleteTagInList(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA + BBB +
        2. +
        3. CCC
        4. +
        + ''' + + def get_xml(self): + delete_tags = DXB.delete_tag(['BBB']) + p_tag = DXB.p_tag([delete_tags]) + + body = DXB.li(text='AAA', ilvl=0, numId=0) + body += p_tag + body += DXB.li(text='CCC', ilvl=0, numId=0) + + xml = DXB.xml(body) + return xml + + +class InsertTagInList(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAABBB +
        2. +
        3. CCC
        4. +
        + ''' + + def get_xml(self): + run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'BBB'] + insert_tags = DXB.insert_tag(run_tags) + p_tag = DXB.p_tag([insert_tags]) + + body = DXB.li(text='AAA', ilvl=0, numId=0) + body += p_tag + body += DXB.li(text='CCC', ilvl=0, numId=0) + + xml = DXB.xml(body) + return xml + + +class SmartTagInList(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAABBB +
        2. +
        3. CCC
        4. +
        + ''' + + def get_xml(self): + run_tags = [DXB.r_tag([DXB.t_tag(i)]) for i in 'BBB'] + smart_tag = DXB.smart_tag(run_tags) + p_tag = DXB.p_tag([smart_tag]) + + body = DXB.li(text='AAA', ilvl=0, numId=0) + body += p_tag + body += DXB.li(text='CCC', ilvl=0, numId=0) + + xml = DXB.xml(body) + return xml + + +class SingleListItem(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA
        2. +
        +

        BBB

        + ''' + + numbering_dict = { + '1': { + '0': 'lowerLetter', + } + } + + def get_xml(self): + li = DXB.li(text='AAA', ilvl=0, numId=1) + p_tags = [ + DXB.p_tag('BBB'), + ] + body = li + for p_tag in p_tags: + body += p_tag + xml = DXB.xml(body) + return xml + + +class SimpleTableTest(_TranslationTestCase): + expected_output = ''' + + + + + + + + + + + + + + + + +
        BlankColumn 1Column 2
        Row 1FirstSecond
        Row 2ThirdFourth
        ''' + + def get_xml(self): + cell1 = DXB.table_cell(paragraph=DXB.p_tag('Blank')) + cell2 = DXB.table_cell(paragraph=DXB.p_tag('Row 1')) + cell3 = DXB.table_cell(paragraph=DXB.p_tag('Row 2')) + cell4 = DXB.table_cell(paragraph=DXB.p_tag('Column 1')) + cell5 = DXB.table_cell(paragraph=DXB.p_tag('First')) + cell6 = DXB.table_cell(paragraph=DXB.p_tag('Third')) + cell7 = DXB.table_cell(paragraph=DXB.p_tag('Column 2')) + cell8 = DXB.table_cell(paragraph=DXB.p_tag('Second')) + cell9 = DXB.table_cell(paragraph=DXB.p_tag('Fourth')) + rows = [DXB.table_row([cell1, cell4, cell7]), + DXB.table_row([cell2, cell5, cell8]), + DXB.table_row([cell3, cell6, cell9])] + table = DXB.table(rows) + body = table + xml = DXB.xml(body) + return xml + + +class MissingIlvl(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA
          + BBB +
        2. +
        3. CCC
        4. +
        + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 1), + ('BBB', None, 1), # Because why not. + ('CCC', 0, 1), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + body = lis + xml = DXB.xml(body) + return xml + + +class SameNumIdInTable(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAA + + + + +
          +
            +
          1. BBB
          2. +
          +
          +
        2. +
        3. CCC
        4. +
        + ''' + + # Ensure its not failing somewhere and falling back to decimal + numbering_dict = { + '1': { + '0': 'lowerLetter', + } + } + + def get_xml(self): + li_text = [ + ('BBB', 0, 1), + ] + lis = '' + for text, ilvl, numId in li_text: + lis += DXB.li(text=text, ilvl=ilvl, numId=numId) + cell1 = DXB.table_cell(lis) + rows = DXB.table_row([cell1]) + table = DXB.table([rows]) + lis = '' + lis += DXB.li(text='AAA', ilvl=0, numId=1) + lis += table + lis += DXB.li(text='CCC', ilvl=0, numId=1) + body = lis + xml = DXB.xml(body) + return xml + + +class SDTTestCase(_TranslationTestCase): + expected_output = ''' +
          +
        1. AAABBB +
        2. +
        3. CCC
        4. +
        + ''' + + def get_xml(self): + body = '' + body += DXB.li(text='AAA', ilvl=0, numId=0) + body += DXB.sdt_tag(p_tag=DXB.p_tag(text='BBB')) + body += DXB.li(text='CCC', ilvl=0, numId=0) + + xml = DXB.xml(body) + return xml + + +class HeadingTestCase(_TranslationTestCase): + expected_output = ''' +

        AAA

        +

        BBB

        +

        CCC

        +

        DDD

        +
        EEE
        +
        GGG
        +

        HHH

        + ''' + + styles_dict = { + 'style0': { + 'style_name': 'heading 1', + }, + 'style1': { + 'style_name': 'heading 2', + }, + 'style2': { + 'style_name': 'heading 3', + }, + 'style3': { + 'style_name': 'heading 4', + }, + 'style4': { + 'style_name': 'heading 5', + }, + 'style5': { + 'style_name': 'heading 6', + }, + } + + def get_xml(self): + p_tags = [ + DXB.p_tag(text='AAA', style='style0'), + DXB.p_tag(text='BBB', style='style1'), + DXB.p_tag(text='CCC', style='style2'), + DXB.p_tag(text='DDD', style='style3'), + DXB.p_tag(text='EEE', style='style4'), + DXB.p_tag(text='GGG', style='style5'), + DXB.p_tag(text='HHH', style='garbage'), + ] + body = '' + for tag in p_tags: + body += tag + + xml = DXB.xml(body) + return xml + + +class StyledBoldingTestCase(_TranslationTestCase): + expected_output = ''' +

        AAA

        +

        BBB

        +

        CCC

        + ''' + + styles_dict = { + 'style0': { + 'style_name': 'p1', + 'default_run_properties': { + 'b': '', + } + }, + } + + def get_xml(self): + p_tags = [ + DXB.p_tag(text='AAA', style='style0'), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('BBB')], + # Don't do duplicates + rpr=DXB.rpr_tag({'b': None}), + ), + ], + style='style0', + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('CCC')], + # Overwrite the current style + rpr=DXB.rpr_tag({'b': 'false'}), + ), + ], + style='style0', + ), + ] + body = '' + for tag in p_tags: + body += tag + + xml = DXB.xml(body) + return xml + + +class RomanNumeralToHeadingTestCase(_TranslationTestCase): + convert_root_level_upper_roman = True + numbering_dict = { + '1': { + '0': 'upperRoman', + '1': 'decimal', + '2': 'upperRoman', + }, + '2': { + '0': 'upperRoman', + '1': 'decimal', + '2': 'upperRoman', + }, + '3': { + '0': 'upperRoman', + '1': 'decimal', + '2': 'upperRoman', + }, + } + expected_output = ''' +

        AAA

        +
          +
        1. BBB
        2. +
        +

        CCC

        +
          +
        1. DDD
        2. +
        +

        EEE

        +
          +
        1. FFF +
            +
          1. GGG
          2. +
          +
        2. +
        + ''' + + def get_xml(self): + li_text = [ + ('AAA', 0, 1), + ('BBB', 1, 1), + ('CCC', 0, 2), + ('DDD', 1, 2), + ('EEE', 0, 3), + ('FFF', 1, 3), + ('GGG', 2, 3), + ] + body = '' + for text, ilvl, numId in li_text: + body += DXB.li(text=text, ilvl=ilvl, numId=numId) + + xml = DXB.xml(body) + return xml + + +class MultipleTTagsInRTag(_TranslationTestCase): + expected_output = ''' +

        ABC

        + ''' + + def get_xml(self): + r_tag = DXB.r_tag( + [DXB.t_tag(letter) for letter in 'ABC'], + ) + p_tag = DXB.p_tag( + [r_tag], + jc='start', + ) + body = p_tag + + xml = DXB.xml(body) + return xml + + +class SuperAndSubScripts(_TranslationTestCase): + expected_output = ''' +

        AAABBB

        +

        CCCDDD

        + ''' + + def get_xml(self): + p_tags = [ + DXB.p_tag( + [ + DXB.r_tag([DXB.t_tag('AAA')]), + DXB.r_tag( + [DXB.t_tag('BBB')], + rpr=DXB.rpr_tag({'vertAlign': 'superscript'}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('CCC')], + rpr=DXB.rpr_tag({'vertAlign': 'subscript'}), + ), + DXB.r_tag([DXB.t_tag('DDD')]), + ], + ), + ] + body = '' + for p_tag in p_tags: + body += p_tag + + xml = DXB.xml(body) + return xml + + +class AvaliableInlineTags(_TranslationTestCase): + expected_output = ''' +

        aaa

        +

        bbb

        +

        ccc

        +

        ddd

        +

        eee

        +

        fff

        +

        ggg

        +

        hhh

        +

        iii

        +

        jjj

        + ''' + + def get_xml(self): + p_tags = [ + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('aaa')], + rpr=DXB.rpr_tag({'b': None}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('bbb')], + rpr=DXB.rpr_tag({'u': None}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('ccc')], + rpr=DXB.rpr_tag({'i': None}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('ddd')], + rpr=DXB.rpr_tag({'caps': None}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('eee')], + rpr=DXB.rpr_tag({'smallCaps': None}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('fff')], + rpr=DXB.rpr_tag({'strike': None}) + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('ggg')], + rpr=DXB.rpr_tag({'dstrike': None}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('hhh')], + rpr=DXB.rpr_tag({'vanish': None}) + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('iii')], + rpr=DXB.rpr_tag({'webHidden': None}), + ), + ], + ), + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag('jjj')], + rpr=DXB.rpr_tag({'vertAlign': 'superscript'}), + ), + ], + ), + ] + body = '' + for p_tag in p_tags: + body += p_tag + + xml = DXB.xml(body) + return xml + + +class UnicodeTestCase(_TranslationTestCase): + expected_output = u""" +

        \U0010001f

        + """ + + def get_xml(self): + tags = [ + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag(r'􀀟')], + ), + ], + ), + ] + + body = '' + for tag in tags: + body += tag + xml = DXB.xml(body) + return xml.encode('utf-8') + + +class NoTextInTTagTestCase(_TranslationTestCase): + expected_output = u""" + """ + + def get_xml(self): + tags = [ + DXB.p_tag( + [ + DXB.r_tag( + [DXB.t_tag(None)], + ), + ], + ), + ] + + body = '' + for tag in tags: + body += tag + xml = DXB.xml(body) + return xml.encode('utf-8') diff --git a/pydocx/utils.py b/pydocx/utils.py new file mode 100644 index 00000000..1323302b --- /dev/null +++ b/pydocx/utils.py @@ -0,0 +1,495 @@ +import re +import collections + +from collections import defaultdict +from xml.etree import cElementTree + +from pydocx.exceptions import MalformedDocxException + + +UPPER_ROMAN_TO_HEADING_VALUE = 'h2' +TAGS_CONTAINING_CONTENT = ( + 't', + 'pict', + 'drawing', + 'delText', + 'ins', +) +TAGS_HOLDING_CONTENT_TAGS = ( + 'p', + 'tbl', + 'sdt', +) + + +class MulitMemoize(object): + ''' + Adapted from: https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize + func_names = { + 'find_all': find_all, + ... + } + ''' + def __init__(self, func_names): + self.cache = dict((func_name, {}) for func_name in func_names) + self.func_names = func_names + + def __call__(self, func_name, *args): + if not isinstance(args, collections.Hashable): + # uncacheable. a list, for instance. + # better to not cache than blow up. 
+ return self.func_names[func_name](*args) + if args in self.cache[func_name]: + return self.cache[func_name][args] + else: + value = self.func_names[func_name](*args) + self.cache[func_name][args] = value + return value + + +class MulitMemoizeMixin(object): + def __init__(self, *args, **kwargs): + super(MulitMemoizeMixin, self).__init__(*args, **kwargs) + self._memoization = None + + def memod_tree_op(self, func_name, *args): + return self._memoization(func_name, *args) + + def populate_memoization(self, func_names): + self._memoization = MulitMemoize(func_names) + + +def el_iter(el): + """ + Go through all elements + """ + try: + return el.iter() + except AttributeError: + return el.findall('.//*') + + +def find_first(el, tag): + """ + Find the first occurrence of a tag beneath the current element. + """ + return el.find('.//' + tag) + + +def find_all(el, tag): + """ + Find all occurrences of a tag + """ + return el.findall('.//' + tag) + + +def find_ancestor_with_tag(pre_processor, el, tag): + """ + Find the first ancestor with that is a `tag`. + """ + while pre_processor.parent(el) is not None: + el = pre_processor.parent(el) + if el.tag == tag: + return el + return None + + +def has_descendant_with_tag(el, tag): + """ + Determine if there is a child ahead in the element tree. + """ + # Get child. stop at first child. + return True if find_first(el, tag) is not None else False + + +def _filter_children(element, tags): + return [ + el for el in element.getchildren() + if el.tag in tags + ] + + +def remove_namespaces(document): + """ + >>> exception_raised = False + >>> try: + ... remove_namespaces('junk') + ... except MalformedDocxException: + ... 
class NamespacedNumId(object):
    '''
    A list `numId` paired with the number of tables the list is nested in.

    Instances compare (and hash) by their `numId:num_tables` representation,
    so two separately created instances with the same values are
    interchangeable as set members and dict keys.
    '''
    def __init__(self, num_id, num_tables, *args, **kwargs):
        # Extra positional/keyword arguments are accepted and ignored.
        self._num_id = num_id
        self._num_tables = num_tables

    def __unicode__(self, *args, **kwargs):
        return '%s:%d' % (
            self._num_id,
            self._num_tables,
        )

    def __repr__(self, *args, **kwargs):
        return self.__unicode__(*args, **kwargs)

    def __eq__(self, other):
        if other is None:
            return False
        return repr(self) == repr(other)

    def __ne__(self, other):
        # NOTE(review): `!= None` answers False just like `== None` does.
        # That looks unintentional, but callers may depend on it, so it is
        # kept as is -- confirm before changing.
        if other is None:
            return False
        return repr(self) != repr(other)

    def __hash__(self):
        # Must agree with __eq__. Without it, equal instances land in
        # different hash buckets on python 2 (sets of num ids never
        # de-duplicate) and instances are unhashable on python 3.
        return hash(repr(self))

    @property
    def num_id(self):
        '''The raw `numId` value this instance was created with.'''
        return self._num_id
def perform_pre_processing(self, root, *args, **kwargs):
    """
    Walk the document once and fill `self.meta_data` for later lookups.

    The order of the steps matters: parent links are recorded first because
    the list and table passes look up ancestors through them, and headings
    and upper roman conversion run after the list items have been marked
    since both of them clear list flags.
    """
    self.populate_memoization({'find_first': find_first})
    self._add_parent(root)
    # If we don't have a numbering root there cannot be any lists.
    if self.numbering_root is not None:
        self._set_list_attributes(root)
    self._set_table_attributes(root)
    self._set_is_in_table(root)

    body = find_first(root, 'body')
    paragraphs = list(find_all(body, 'p'))
    list_items = [p for p in paragraphs if self.is_list_item(p)]

    # Find the first and last li elements
    num_ids = set(self.num_id(item) for item in list_items)
    ilvls = set(self.ilvl(item) for item in list_items)
    self._set_first_list_item(num_ids, ilvls, list_items)
    self._set_last_list_item(num_ids, list_items)

    self._set_headers(paragraphs)
    self._convert_upper_roman(body)
    self._set_next(body)
self.meta_data[el].get('vmerge_continue') + + def next(self, el): + if el not in self.meta_data: + return + return self.meta_data[el].get('next') + + def previous(self, el): + if el not in self.meta_data: + return + return self.meta_data[el].get('previous') + + def parent(self, el): + return self.meta_data[el].get('parent') + + def _add_parent(self, el): # if a parent, make that an attribute + for child in el.getchildren(): + self.meta_data[child]['parent'] = el + self._add_parent(child) + + def _set_list_attributes(self, el): + list_elements = find_all(el, 'numId') + for li in list_elements: + parent = find_ancestor_with_tag(self, li, 'p') + # Deleted text in a list will have a numId but no ilvl. + if parent is None: + continue + parent_ilvl = self.memod_tree_op('find_first', parent, 'ilvl') + if parent_ilvl is None: + continue + self.meta_data[parent]['is_list_item'] = True + self.meta_data[parent]['num_id'] = self._generate_num_id(parent) + self.meta_data[parent]['ilvl'] = parent_ilvl.attrib['val'] + + def _generate_num_id(self, el): + ''' + Fun fact: It is possible to have a list in the root, that holds a table + that holds a list and for both lists to have the same numId. When this + happens we should namespace the nested list with the number of tables + it is in to ensure it is considered a new list. Otherwise all sorts of + terrible html gets generated. + ''' + num_id = find_first(el, 'numId').attrib['val'] + + # First, go up the parent until we get None and count the number of + # tables there are. + num_tables = 0 + while self.parent(el) is not None: + if el.tag == 'tbl': + num_tables += 1 + el = self.parent(el) + return NamespacedNumId( + num_id=num_id, + num_tables=num_tables, + ) + + def _set_first_list_item(self, num_ids, ilvls, list_elements): + # Lists are grouped by having the same `num_id` and `ilvl`. The first + # list item is the first list item found for each `num_id` and `ilvl` + # combination. 
+ for num_id in num_ids: + for ilvl in ilvls: + filtered_list_elements = [ + i for i in list_elements + if ( + self.num_id(i) == num_id and + self.ilvl(i) == ilvl + ) + ] + if not filtered_list_elements: + continue + first_el = filtered_list_elements[0] + self.meta_data[first_el]['is_first_list_item'] = True + + def _set_last_list_item(self, num_ids, list_elements): + # Find last list elements. Only mark list tags as the last list tag if + # it is in the root of the document. This is only used to ensure that + # once a root level list is finished we do not roll in the rest of the + # non list elements into the first root level list. + for num_id in num_ids: + filtered_list_elements = [ + i for i in list_elements + if self.num_id(i) == num_id + ] + if not filtered_list_elements: + continue + last_el = filtered_list_elements[-1] + self.meta_data[last_el]['is_last_list_item_in_root'] = True + + def _set_table_attributes(self, el): + tables = find_all(el, 'tbl') + for table in tables: + rows = _filter_children(table, ['tr']) + if rows is None: + continue + for i, row in enumerate(rows): + tcs = _filter_children(row, ['tc']) + for j, child in enumerate(tcs): + self.meta_data[child]['row_index'] = i + self.meta_data[child]['column_index'] = j + v_merge = find_first(child, 'vMerge') + if ( + v_merge is not None and + ('continue' == v_merge.get('val', '') or + v_merge.attrib == {}) + ): + self.meta_data[child]['vmerge_continue'] = True + + def _set_is_in_table(self, el): + paragraph_elements = find_all(el, 'p') + for p in paragraph_elements: + if find_ancestor_with_tag(self, p, 'tc') is not None: + self.meta_data[p]['is_in_table'] = True + + def _set_headers(self, elements): + # These are the styles for headers and what the html tag should be if + # we have one. 
+ headers = { + 'heading 1': 'h1', + 'heading 2': 'h2', + 'heading 3': 'h3', + 'heading 4': 'h4', + 'heading 5': 'h5', + 'heading 6': 'h6', + 'heading 7': 'h6', + 'heading 8': 'h6', + 'heading 9': 'h6', + 'heading 10': 'h6', + } + # Remove the rPr from the styles dict since all the styling will be + # down with the heading. + for style_id, styles in self.styles_dict.items(): + if styles.get('style_name', '').lower() in headers: + if 'default_run_properties' in styles: + del styles['default_run_properties'] + + for element in elements: + # This element is using the default style which is not a heading. + p_style = find_first(element, 'pStyle') + if p_style is None: + continue + style = p_style.attrib.get('val', '') + metadata = self.styles_dict.get(style, {}) + style_name = metadata.get('style_name') + + # Check to see if this element is actually a header. + if style_name and style_name.lower() in headers: + # Set all the list item variables to false. + self.meta_data[element]['is_list_item'] = False + self.meta_data[element]['is_first_list_item'] = False + self.meta_data[element]['is_last_list_item_in_root'] = False + # Prime the heading_level + self.meta_data[element]['heading_level'] = headers[style_name.lower()] # noqa + + def _convert_upper_roman(self, body): + if not self.convert_root_level_upper_roman: + return + first_root_list_items = [ + # Only root level elements. 
def _set_next(self, body):
    """
    Link neighbouring content elements through `next` and `previous`.

    Only direct children whose tag is in `TAGS_HOLDING_CONTENT_TAGS` and
    that contain at least one of the `TAGS_CONTAINING_CONTENT` tags take
    part. The links are stored in `self.meta_data`, once for the children
    of `body` and once for the children of every table cell beneath it.
    """
    def _get_children_with_content(el):
        # We only care about children if they have text in them.
        return [
            child
            for child in _filter_children(el, TAGS_HOLDING_CONTENT_TAGS)
            if any(
                has_descendant_with_tag(child, tag)
                for tag in TAGS_CONTAINING_CONTENT
            )
        ]

    def _assign_next(children):
        # Walk adjacent pairs so the first child gets no `previous` and the
        # last child gets no `next`. Indexing with `i - 1` would wrap
        # around at `i == 0` and hand the first child the last one (or
        # itself, for a single child) as its `previous`.
        for current, following in zip(children, children[1:]):
            self.meta_data[current]['next'] = following
            self.meta_data[following]['previous'] = current

    # Assign next for everything in the root.
    _assign_next(_get_children_with_content(body))

    # In addition set next for everything in table cells.
    for tc in find_all(body, 'tc'):
        _assign_next(_get_children_with_content(tc))
:: Python :: 2.6", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 2 :: Only", + "Intended Audience :: Developers", + "License :: OSI Approved :: BSD License", + "Operating System :: OS Independent", + "Topic :: Text Processing :: Markup :: HTML", + "Topic :: Text Processing :: Markup :: XML", + ], + long_description=get_description(), + entry_points={ + 'console_scripts': [ + 'pydocx = pydocx.__init__:main', + ], + }, +)