diff --git a/.gitignore b/.gitignore
index ce7a7cef..5a57b80d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -37,3 +37,8 @@ pip-log.txt
 nosetests.xml
 *.mo
 .idea
+
+test.html
+testxml.html
+
+main.py
diff --git a/.travis.yml b/.travis.yml
index 6a5babb4..4251ba15 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -2,9 +2,13 @@ language: python
 python:
   - "2.6"
   - "2.7"
-script: python main.py
+script: ./run_tests.sh
 install:
+  - python setup.py -q install
   - pip install -r requirements.txt
+env:
+  - TRAVIS_EXECUTE_PERFORMANCE=1
 notifications:
   email:
     - jason.louard.ward@gmail.com
+    - samson91787@gmail.com
diff --git a/AUTHORS b/AUTHORS
new file mode 100644
index 00000000..81a14d38
--- /dev/null
+++ b/AUTHORS
@@ -0,0 +1,2 @@
+Sam Protnow <samson91787@gmail.com>
+Jason Ward <jason.louard.ward@gmail.com>
diff --git a/CHANGELOG b/CHANGELOG
new file mode 100644
index 00000000..d40440c9
--- /dev/null
+++ b/CHANGELOG
@@ -0,0 +1,74 @@
+
+Changelog
+=========
+* 0.3.13
+    * Significant performance gains for documents with a large number of table
+      cells.
+    * Significant performance gains for large documents.
+* 0.3.12
+    * Added command line support to convert from docx to either html or
+      markdown.
+* 0.3.11
+    * The non breaking hyphen tag was not correctly being imported. This issue
+      has been fixed.
+* 0.3.10
+    * Found and optimized a fairly large performance issue with tables that had
+      large amounts of content within a single cell, which includes nested
+      tables.
+* 0.3.9
+    * We are now respecting the `<w:tab/>` element. We are putting a space
+      everywhere one occurs.
+    * Each styling can have a default defined based on values in `styles.xml`.
+      These default styles can be overwritten using the `rPr` on the actual `r`
+      tag. These default styles defined in `styles.xml` are actually being
+      respected now.
+* 0.3.8
+    * If zipfile fails to open the passed in file, we are now raising a
+      `MalformedDocxException` instead of a `BadZipfile`.
+* 0.3.7
+    * Some inline tags (most notably the underline tag) could have a `val` of
+      `none` and that would signify that the style is disabled. A `val` of
+      `none` is now correctly handled.
+* 0.3.6
+    * It is possible for a docx file to not contain a `numbering.xml` file but
+      still try to use lists. Now if this happens all lists get converted to
+      paragraphs.
+* 0.3.5
+    * Not all docx files contain a `styles.xml` file. We are no longer assuming
+      they do.
+* 0.3.4
+    * It is possible for `w:t` tags to have `text` set to `None`. This no
+      longer causes an error when escaping that text.
+* 0.3.3
+    * In the event that `cElementTree` has a problem parsing the document, a
+      `MalformedDocxException` is raised instead of a `SyntaxError`.
+* 0.3.2
+    * We were not taking into account that vertical merges should have a
+      continue attribute, but sometimes they do not, and in those cases word
+      assumes the continue attribute. We updated the parser to handle the
+      cases in which the continue attribute is not there.
+    * We now correctly handle documents with unicode characters in the
+      namespace.
+    * In rare cases, some text would be output with a style when it should not
+      have been. This issue has been fixed.
+* 0.3.1
+    * Added support for several more OOXML tags including:
+        * caps
+        * smallCaps
+        * strike
+        * dstrike
+        * vanish
+        * webHidden
+      More details in the README.
+* 0.3.0
+    * We switched from using stock *xml.etree.ElementTree* to using
+      *xml.etree.cElementTree*. This has resulted in a fairly significant speed
+      increase for python 2.6
+    * It is now possible to create your own pre processor to do additional pre
+      processing.
+    * Superscripts and subscripts are now extracted correctly.
+* 0.2.1
+    * Added a changelog
+    * Added the version in pydocx.__init__
+    * Fixed an issue with duplicating content if there was indentation or
+      justification on a p element that had multiple t tags.
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 00000000..88fbbf67
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,7 @@
+include AUTHORS
+include CHANGELOG
+include LICENSE
+include MANIFEST.in
+include README.rst
+include pydocx/fixtures/*
+include pydocx/tests/templates/*
diff --git a/README.md b/README.md
deleted file mode 100644
index e3773551..00000000
--- a/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-pydocx
-======
\ No newline at end of file
diff --git a/README.rst b/README.rst
new file mode 100644
index 00000000..1bb9b3b1
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,238 @@
+======
+pydocx
+======
+.. image:: https://travis-ci.org/CenterForOpenScience/pydocx.png?branch=master
+   :align: left
+   :target: https://travis-ci.org/CenterForOpenScience/pydocx
+
+pydocx is a parser that breaks down the elements of a docx file and converts them
+into different markup languages. Right now, HTML is supported. Markdown and LaTeX
+will be available soon. You can extend any of the available parsers to customize it
+to your needs. You can also create your own class that inherits DocxParser
+to create your own methods for a markup language not yet supported.
+
+Currently Supported
+###################
+
+* tables
+    * nested tables
+    * rowspans
+    * colspans
+    * lists in tables
+* lists
+    * list styles
+    * nested lists
+    * list of tables
+    * list of paragraphs
+* justification
+* images
+* styles
+    * bold
+    * italics
+    * underline
+    * hyperlinks
+* headings
+
+Usage
+#####
+
+DocxParser includes abstract methods that each parser overrides to satisfy its own needs. The abstract methods are as follows:
+
+::
+
+    class DocxParser:
+
+        @property
+        def parsed(self):
+            return self._parsed
+
+        @abstractmethod
+        def escape(self, text):
+            return text
+
+        @abstractmethod
+        def linebreak(self):
+            return ''
+
+        @abstractmethod
+        def paragraph(self, text):
+            return text
+
+        @abstractmethod
+        def heading(self, text, heading_level):
+            return text
+
+        @abstractmethod
+        def insertion(self, text, author, date):
+            return text
+
+        @abstractmethod
+        def hyperlink(self, text, href):
+            return text
+
+        @abstractmethod
+        def image_handler(self, path):
+            return path
+
+        @abstractmethod
+        def image(self, path, x, y):
+            return self.image_handler(path)
+
+        @abstractmethod
+        def deletion(self, text, author, date):
+            return text
+
+        @abstractmethod
+        def bold(self, text):
+            return text
+
+        @abstractmethod
+        def italics(self, text):
+            return text
+
+        @abstractmethod
+        def underline(self, text):
+            return text
+
+        @abstractmethod
+        def superscript(self, text):
+            return text
+
+        @abstractmethod
+        def subscript(self, text):
+            return text
+
+        @abstractmethod
+        def tab(self):
+            return True
+
+        @abstractmethod
+        def ordered_list(self, text):
+            return text
+
+        @abstractmethod
+        def unordered_list(self, text):
+            return text
+
+        @abstractmethod
+        def list_element(self, text):
+            return text
+
+        @abstractmethod
+        def table(self, text):
+            return text 
+        @abstractmethod
+        def table_row(self, text):
+            return text
+
+        @abstractmethod
+        def table_cell(self, text):
+            return text
+
+        @abstractmethod
+        def page_break(self):
+            return True
+
+        @abstractmethod
+        def indent(self, text, left='', right='', firstLine=''):
+            return text
+
+Docx2Html inherits DocxParser and implements basic HTML handling. Ex.
+
+::
+
+    class Docx2Html(DocxParser):
+
+        #  Escape '&', '<', and '>' so we render the HTML correctly
+        def escape(self, text):
+            return xml.sax.saxutils.quoteattr(text)[1:-1]
+
+        # return a line break
+        def linebreak(self, pre=None):
+            return '<br />'
+
+        # add paragraph tags
+        def paragraph(self, text, pre=None):
+            return '<p>' + text + '</p>'
+
+
+However, let's say you want to add a specific style to your HTML document. To do this, you want to give each paragraph the CSS class `my_implementation`. Simply extend Docx2Html and override what you need.
+
+::
+
+     class My_Implementation_of_Docx2Html(Docx2Html):
+
+        def paragraph(self, text, pre = None):
+            return '<p class="my_implementation">' + text + '</p>'
+
+
+
+OR, let's say FOO is your new favorite markup language. Simply customize your own new parser, overriding the abstract methods of DocxParser.
+
+::
+
+    class Docx2Foo(DocxParser):
+
+        # because linebreaks are denoted by '!!!!!!!!!!!!' in the FOO markup language  :)
+        def linebreak(self):
+            return '!!!!!!!!!!!!'
+
+Custom Pre-Processor
+####################
+
+When creating your own Parser (as described above) you can now add in your own custom Pre Processor. To do so you will need to set the `pre_processor_class` field on the custom parser, like so:
+
+::
+
+    class Docx2Foo(DocxParser):
+        pre_processor_class = FooPreProcessor
+
+
+The `FooPreProcessor` will need a few things to get you going:
+
+::
+
+    class FooPreProcessor(PydocxPreProcessor):
+        def perform_pre_processing(self, root, *args, **kwargs):
+            super(FooPreProcessor, self).perform_pre_processing(root, *args, **kwargs)
+            self._set_foo(root)
+
+        def _set_foo(self, root):
+            pass
+
+If you want `_set_foo` to be called you must add it to `perform_pre_processing` which is called in the base parser for pydocx.
+
+Everything done during pre-processing is executed prior to `parse` being called for the first time.
+
+
+Styles
+######
+
+The base parser `Docx2Html` relies on certain CSS classes being set for certain behaviour to occur. Currently these include:
+
+* class `pydocx-insert` -> Turns the text green.
+* class `pydocx-delete` -> Turns the text red and draws a line through the text.
+* class `pydocx-center` -> Aligns the text to the center.
+* class `pydocx-right` -> Aligns the text to the right.
+* class `pydocx-left` -> Aligns the text to the left.
+* class `pydocx-comment` -> Turns the text blue.
+* class `pydocx-underline` -> Underlines the text.
+* class `pydocx-caps` -> Makes all text uppercase.
+* class `pydocx-small-caps` -> Makes all text uppercase, however truly lowercase letters will be smaller than their uppercase counterparts.
+* class `pydocx-strike` -> Strike a line through.
+* class `pydocx-hidden` -> Hide the text.
+
+Exceptions
+##########
+
+Right now there is only one custom exception (`MalformedDocxException`). It is raised if either the `xml` or `zipfile` libraries raise an exception.
+
+Optional Arguments
+##################
+
+You can pass in `convert_root_level_upper_roman=True` to the parser and it will convert all root level upper roman lists to headings instead.
+
+Command Line Execution
+######################
+
+First you have to install pydocx; this can be done by running the command `pip install pydocx`. From there you can simply call the command `pydocx --html path/to/file.docx path/to/output.html`. Change `pydocx --html` to `pydocx --markdown` in order to convert to markdown instead.
diff --git a/main.py b/main.py
deleted file mode 100644
index c9e8e1d4..00000000
--- a/main.py
+++ /dev/null
@@ -1,12 +0,0 @@
-from pydocx import *
-from bs4 import BeautifulSoup
-import xml.etree.ElementTree as ElementTree
-#import lxml.etree as etree
-
-with open('test.html', 'w') as f:
-    f.write(docx2html('helloworld.docx'))
-with open('testxml.html','w') as f:
-    f.write(BeautifulSoup(ElementTree.tostring(Docx2Html('helloworld.docx').root)).prettify())
-
-#print docx2html('helloworld.docx')
-#print docx2markdown('helloworld.docx')
\ No newline at end of file
diff --git a/pydocx/DocxParser.py b/pydocx/DocxParser.py
index b3006ef0..fb08b180 100644
--- a/pydocx/DocxParser.py
+++ b/pydocx/DocxParser.py
@@ -1,323 +1,715 @@
-from abc import abstractmethod, ABCMeta
-import zipfile
 import logging
-import xml.etree.ElementTree as ElementTree
-from xml.etree.ElementTree import _ElementInterface
+import os
+import zipfile
+
+from abc import abstractmethod, ABCMeta
+from contextlib import contextmanager
+
+from pydocx.utils import (
+    MulitMemoizeMixin,
+    PydocxPreProcessor,
+    find_all,
+    find_ancestor_with_tag,
+    find_first,
+    get_list_style,
+    has_descendant_with_tag,
+    parse_xml_from_string,
+)
+from pydocx.exceptions import MalformedDocxException
 
 logging.basicConfig(level=logging.DEBUG)
 logger = logging.getLogger("NewParser")
 
 
-def remove_namespaces(document):
-    root = ElementTree.fromstring(document)
-    for child in el_iter(root):
-        child.tag = child.tag.split("}")[1]
-        child.attrib = dict(
-            (k.split("}")[1], v)
-            for k, v in child.attrib.items()
-        )
-    return ElementTree.tostring(root)
-
-# Add some helper functions to Element to make it slightly more readable
-
-
-def has_child(self, tag):
-    return True if self.find(tag) is not None else False
-
-
-def has_child_all(self, tag):
-    return True if self.find('.//' + tag) is not None else False
+# http://openxmldeveloper.org/discussions/formats/f/15/p/396/933.aspx
+EMUS_PER_PIXEL = 9525
+USE_ALIGNMENTS = True
 
+JUSTIFY_CENTER = 'center'
+JUSTIFY_LEFT = 'left'
+JUSTIFY_RIGHT = 'right'
 
-def find_all(self, tag):
-    return self.find('.//' + tag)
+INDENTATION_RIGHT = 'right'
+INDENTATION_LEFT = 'left'
+INDENTATION_FIRST_LINE = 'firstLine'
+DISABLED_STYLE_VALUES = ['false', '0', 'none']
 
-
-def findall_all(self, tag):
-    return self.findall('.//' + tag)
+# Add some helper functions to Element to make it slightly more readable
 
 
-def el_iter(el):
+@contextmanager
+def ZipFile(path):  # This is not needed in python 3.2+
     try:
-        return el.iter()
-    except AttributeError:
-        return el.findall('.//*')
-
-
-setattr(_ElementInterface, 'has_child', has_child)
-setattr(_ElementInterface, 'has_child_all', has_child_all)
-setattr(_ElementInterface, 'find_all', find_all)
-setattr(_ElementInterface, 'findall_all', findall_all)
-setattr(_ElementInterface, 'parent', None)
-setattr(_ElementInterface, 'parent_list', [])
-
-# End helpers
+        f = zipfile.ZipFile(path)
+    except zipfile.BadZipfile:
+        raise MalformedDocxException('Passed in document is not a docx')
+    yield f
+    f.close()
 
 
-class DocxParser:
+class DocxParser(MulitMemoizeMixin):
     __metaclass__ = ABCMeta
+    pre_processor_class = PydocxPreProcessor
 
-    def __init__(self, path):
-        self._parsed = ''
-        self.in_list = False
-
-        f = zipfile.ZipFile(path)
+    def _extract_xml(self, f, xml_path):
         try:
-            self.document_text = f.read('word/document.xml')
-            try:
-                self.numbering_text = f.read('word/numbering.xml')
-            except zipfile.BadZipfile:
-                pass
-            try:
-                self.comment_text = f.read('word/comments.xml')
-            except zipfile.BadZipfile:
-                pass
-        finally:
-            f.close()
-
-        self.root = ElementTree.fromstring(
-            remove_namespaces(self.document_text),
-        )
-
-        def add_parent(el):
-            for child in el.getchildren():
-                setattr(child, 'parent', el)
-                add_parent(child)
-        add_parent(self.root)
-
-        def create_parent_list(el, tmp=None):
-            if tmp is None:
-                tmp = []
-            for child in el:
-                tmp.append(el)
-                tmp = create_parent_list(child, tmp)
-            el.parent_list = tmp[:]
-            try:
-                tmp.pop()
-            except:
-                tmp = []
-            return tmp
-
-        create_parent_list(self.root)
+            return f.read(xml_path)
+        except KeyError:
+            return None
 
+    def _build_data(self, path, *args, **kwargs):
+        with ZipFile(path) as f:
+            # These must be in the ZIP in order for the docx to be valid.
+            self.document_text = f.read('word/document.xml')
+            self.relationship_text = f.read('word/_rels/document.xml.rels')
+
+            # These are all optional.
+            self.styles_text = self._extract_xml(f, 'word/styles.xml')
+            self.fonts = self._extract_xml(f, 'word/fontTable.xml')
+            self.numbering_text = self._extract_xml(f, 'word/numbering.xml')
+            self.comment_text = self._extract_xml(f, 'word/comments.xml')
+
+            zipped_image_files = [
+                e for e in f.infolist()
+                if e.filename.startswith('word/media/')
+            ]
+            for e in zipped_image_files:
+                self._image_data[e.filename] = f.read(e.filename)
+
+        self.root = parse_xml_from_string(self.document_text)
+        self.numbering_root = None
+        if self.numbering_text:
+            self.numbering_root = parse_xml_from_string(self.numbering_text)
+        self.comment_root = None
+        if self.comment_text:
+            self.comment_root = parse_xml_from_string(self.comment_text)
+
+    def _parse_run_properties(self, rPr):
+        """
+        Takes an `rPr` and returns a dictionary contain the tag name mapped to
+        the child's value property.
+
+        If you have an rPr that looks like this:
+        <w:rPr>
+            <w:b/>
+            <w:u val="false"/>
+            <w:sz val="16"/>
+        </w:rPr>
+
+        That will result in a dictionary that looks like this:
+        {
+            'b': '',
+            'u': 'false',
+            'sz': '16',
+        }
+        """
+        run_properties = {}
+        if rPr is None:
+            return {}
+        for run_property in rPr:
+            val = run_property.get('val', '').lower()
+            run_properties[run_property.tag] = val
+        return run_properties
+
+    def _parse_styles(self):
+        if self.styles_text is None:
+            return {}
+        tree = parse_xml_from_string(self.styles_text)
+        styles_dict = {}
+        for style in find_all(tree, 'style'):
+            style_val = find_first(style, 'name').attrib['val']
+            run_properties = find_first(style, 'rPr')
+            styles_dict[style.attrib['styleId']] = {
+                'style_name': style_val,
+                'default_run_properties': self._parse_run_properties(
+                    run_properties,
+                ),
+            }
+        return styles_dict
+
+    def _parse_rels_root(self):
+        tree = parse_xml_from_string(self.relationship_text)
+        rels_dict = {}
+        for el in tree:
+            rId = el.get('Id')
+            target = el.get('Target')
+            rels_dict[rId] = target
+        return rels_dict
+
+    def __init__(
+            self,
+            path,
+            convert_root_level_upper_roman=False,
+            *args,
+            **kwargs):
+        self._parsed = ''
+        self.block_text = ''
+        self.page_width = 0
+        self.convert_root_level_upper_roman = convert_root_level_upper_roman
+        self._image_data = {}
+        self._build_data(path, *args, **kwargs)
+        self.pre_processor = None
+
+        #divide by 20 to get to pt (Office works in 20th's of a point)
+        """
+        see http://msdn.microsoft.com/en-us/library/documentformat
+        .openxml.wordprocessing.indentation.aspx
+        """
+        if find_first(self.root, 'pgSz') is not None:
+            self.page_width = int(
+                find_first(self.root, 'pgSz').attrib['w']
+            ) / 20
+
+        #all blank when we init
         self.comment_store = None
-        self.numbering_store = None
-        self.ignore_current = False
-        self.elements = []
-        self.tables_seen = []
-        self.visited = []
-        try:
-            self.numbering_root = ElementTree.fromstring(
-                remove_namespaces(self.numbering_text),
-            )
-        except:
-            pass
-        self.parse_begin(self.root)
+        self.visited = set()
+        self.list_depth = 0
+        self.rels_dict = self._parse_rels_root()
+        self.styles_dict = self._parse_styles()
+        self.parse_begin(self.root)  # begin to parse
 
     def parse_begin(self, el):
-        self._parsed += self.parse_lists(el)
-
-### parse table function and is_table flag
-    def parse_lists(self, el):
-        parsed = ''
-        first_p = el.find_all('p')
-        children = []
-        for child in first_p.parent:
-            if child.tag == 'p' or child.tag == 'tbl':
-                children.append(child)
-        p_list = children
-        list_started = False
-        list_type = ''
-        list_chunks = []
-        index_start = 0
-        index_end = 1
-        for i, el in enumerate(p_list):
-            if not list_started and el.has_child_all('ilvl'):
-                list_started = True
-                list_type = self.get_list_style(
-                    el.find_all('numId').attrib['val'],
-                )
-                list_chunks.append(p_list[index_start:index_end])
-                index_start = i
-                index_end = i+1
-            elif (
-                    list_started and
-                    el.has_child_all('ilvl') and
-                    not list_type == self.get_list_style(
-                        el.find_all('numId').attrib['val']
-                    )):
-                list_type = self.get_list_style(
-                    el.find_all('numId').attrib['val'],
-                )
-                list_started = True
-                list_chunks.append(p_list[index_start:index_end])
-                index_start = i
-                index_end = i+1
-            elif list_started and not el.has_child_all('ilvl'):
-                list_started = False
-                list_chunks.append(p_list[index_start:index_end])
-                index_start = i
-                index_end = i+1
-            else:
-                index_end = i+1
-        list_chunks.append(p_list[index_start:index_end])
-        for chunk in list_chunks:
-            chunk_parsed = ''
-            for el in chunk:
-                chunk_parsed += self.parse(el)
-            if chunk[0].has_child_all('ilvl'):
-                lst_style = self.get_list_style(
-                    chunk[0].find_all('numId').attrib['val'],
-                )
-                if lst_style['val'] == 'bullet':
-                    parsed += self.unordered_list(chunk_parsed)
-                else:
-                    parsed += self.ordered_list(chunk_parsed)
-            elif chunk[0].has_child_all('br'):
-                parsed += self.page_break()
-            else:
-                parsed += chunk_parsed
-
-        return parsed
+        self.populate_memoization({
+            'find_all': find_all,
+            'find_first': find_first,
+            'has_descendant_with_tag': has_descendant_with_tag,
+            '_get_tcs_in_column': self._get_tcs_in_column,
+        })
+        self.pre_processor = self.pre_processor_class(
+            convert_root_level_upper_roman=self.convert_root_level_upper_roman,
+            styles_dict=self.styles_dict,
+            numbering_root=self.numbering_root,
+        )
+        self.pre_processor.perform_pre_processing(el)
+        self._parsed += self.parse(el)
 
     def parse(self, el):
+        if el in self.visited:
+            return ''
+        self.visited.add(el)
         parsed = ''
-        if not self.ignore_current:
-            tmp_d = dict(
-                (tmpel.tag, i)
-                for i, tmpel in enumerate(el.parent_list)
-            )
-            if (
-                    'tbl' in tmp_d and
-                    el.parent_list[tmp_d['tbl']] not in self.tables_seen):
-                self.ignore_current = True
-                self.tables_seen.append(el.parent_list[tmp_d['tbl']])
-                tmpout = self.table(self.parse(el.parent_list[tmp_d['tbl']]))
-                self.ignore_current = False
-                return tmpout
-
         for child in el:
+            # recursive. So you can get all the way to the bottom
             parsed += self.parse(child)
-
-        if el.tag == 'br' and el.attrib['type'] == 'page':
-            #TODO figure out what parsed is getting overwritten
-            return self.page_break()
-        # add it to the list so we don't repeat!
-        if el.tag == 'ilvl' and el not in self.visited:
-            self.in_list = True
-            self.visited.append(el)
-            ## This starts the returns
+        if el.tag == 'br' and el.attrib.get('type') == 'page':
+            return self.parse_page_break(el, parsed)
+        # page breaks use lastRenderedPageBreak in MS Word > 2007
+        elif el.tag == 'lastRenderedPageBreak':
+            return self.parse_page_break(el, parsed)
+        elif el.tag == 'tbl':
+            return self.parse_table(el, parsed)
         elif el.tag == 'tr':
-            return self.table_row(parsed)
+            return self.parse_table_row(el, parsed)
         elif el.tag == 'tc':
-            self.elements.append(el)
-            return self.table_cell(parsed)
-        if el.tag == 'r' and el not in self.elements:
-            self.elements.append(el)
-            return self.parse_r(el)
+            return self.parse_table_cell(el, parsed)
+        elif el.tag == 'r':
+            return self.parse_r(el, parsed)
+        elif el.tag == 't':
+            return self.parse_t(el, parsed)
+        elif el.tag == 'tab':
+            return self.parse_tab(el, parsed)
+        elif el.tag == 'noBreakHyphen':
+            return self.parse_hyphen(el, parsed)
+        elif el.tag == 'br':
+            return self.parse_break_tag(el, parsed)
+        elif el.tag == 'delText':
+            return self.parse_deletion(el, parsed)
         elif el.tag == 'p':
             return self.parse_p(el, parsed)
         elif el.tag == 'ins':
-            return self.insertion(parsed, '', '')
+            return self.parse_insertion(el, parsed)
+        elif el.tag == 'hyperlink':
+            return self.parse_hyperlink(el, parsed)
+        elif el.tag in ('pict', 'drawing'):
+            return self.parse_image(el)
+        else:
+            return parsed
+
+    def parse_page_break(self, el, text):
+        #TODO figure out what parsed is getting overwritten
+        return self.page_break()
+
+    def parse_table(self, el, text):
+        return self.table(text)
+
+    def parse_table_row(self, el, text):
+        return self.table_row(text)
+
+    def parse_table_cell(self, el, text):
+        v_merge = find_first(el, 'vMerge')
+        if v_merge is not None and (
+                'restart' != v_merge.get('val', '')):
+            return ''
+        colspan = self.get_colspan(el)
+        rowspan = self._get_rowspan(el, v_merge)
+        if rowspan > 1:
+            rowspan = str(rowspan)
+        else:
+            rowspan = ''
+        return self.table_cell(text, colspan, rowspan)
+
+    def parse_list(self, el, text):
+        """
+        All the meat of building the list is done in _parse_list, however we
+        call this method for two reasons: It is the naming convention we are
+        following. And we need a reliable way to raise and lower the list_depth
+        (which is used to determine if we are in a list). I could have done
+        this in _parse_list, however it seemed cleaner to do it here.
+        """
+        self.list_depth += 1
+        parsed = self._parse_list(el, text)
+        self.list_depth -= 1
+        if self.pre_processor.is_in_table(el):
+            return self.parse_table_cell_contents(el, parsed)
+        return parsed
+
+    def get_list_style(self, num_id, ilvl):
+        return get_list_style(self.numbering_root, num_id, ilvl)
+
+    def _build_list(self, el, text):
+        # Get the list style for the pending list.
+        lst_style = self.get_list_style(
+            self.pre_processor.num_id(el).num_id,
+            self.pre_processor.ilvl(el),
+        )
+
+        parsed = text
+        # Create the actual list and return it.
+        if lst_style == 'bullet':
+            return self.unordered_list(parsed)
         else:
+            return self.ordered_list(
+                parsed,
+                lst_style,
+            )
+
+    def _parse_list(self, el, text):
+        parsed = self.parse_list_item(el, text)
+        num_id = self.pre_processor.num_id(el)
+        ilvl = self.pre_processor.ilvl(el)
+        # Everything after this point assumes the first element is not also the
+        # last. If the first element is also the last then early return by
+        # building and returning the completed list.
+        if self.pre_processor.is_last_list_item_in_root(el):
+            return self._build_list(el, parsed)
+        next_el = self.pre_processor.next(el)
+
+        def is_same_list(next_el, num_id, ilvl):
+            # Bail if next_el is not an element
+            if next_el is None:
+                return False
+            if self.pre_processor.is_last_list_item_in_root(next_el):
+                return False
+            # If next_el is not a list item then roll it into the list by
+            # returning True.
+            if not self.pre_processor.is_list_item(next_el):
+                return True
+            if self.pre_processor.num_id(next_el) != num_id:
+                # The next element is a new list entirely
+                return False
+            if self.pre_processor.ilvl(next_el) < ilvl:
+                # The next element is de-indented, so this is really the last
+                # element in the list
+                return False
+            return True
+
+        while is_same_list(next_el, num_id, ilvl):
+            if next_el in self.visited:
+                # Early continue for elements we have already visited.
+                next_el = self.pre_processor.next(next_el)
+                continue
+
+            if self.pre_processor.is_list_item(next_el):
+                # Reset the ilvl
+                ilvl = self.pre_processor.ilvl(next_el)
+
+            parsed += self.parse(next_el)
+            next_el = self.pre_processor.next(next_el)
+
+        def should_parse_last_el(last_el, first_el):
+            if last_el is None:
+                return False
+            # Different list
+            if (
+                    self.pre_processor.num_id(last_el) !=
+                    self.pre_processor.num_id(first_el)):
+                return False
+            # Will be handled when the ilvls do match (nesting issue)
+            if (
+                    self.pre_processor.ilvl(last_el) !=
+                    self.pre_processor.ilvl(first_el)):
+                return False
+            # We only care about last items that have not been parsed before
+            # (first list items are always parsed at the beginning of this
+            # method.)
+            return (
+                not self.pre_processor.is_first_list_item(last_el) and
+                self.pre_processor.is_last_list_item_in_root(last_el)
+            )
+        if should_parse_last_el(next_el, el):
+            parsed += self.parse(next_el)
+
+        # If the list has no content, then we don't need to worry about the
+        # list styling, because it will be stripped out.
+        if parsed == '':
             return parsed
 
+        return self._build_list(el, parsed)
+
+    def justification(self, el, text):
+        """
+        Apply the alignment (`jc`) and indentation (`ind`) found in the `pPr`
+        of `el` to `text` through `self.indent`; `text` is returned untouched
+        when nothing usable is found.
+        """
+        ppr = el.find('pPr')
+        if ppr is None:
+            return text
+        jc = ppr.find('jc')
+        ind = ppr.find('ind')
+        if jc is None and ind is None:
+            return text
+
+        alignment = None
+        # Only the known alignments are honoured, anything else is ignored.
+        if jc is not None and jc.attrib['val'] in [
+                JUSTIFY_LEFT, JUSTIFY_CENTER, JUSTIFY_RIGHT]:
+            alignment = jc.attrib['val']
+
+        # For every indent: divide by 20 to get to pt, multiply by (4/3) to
+        # get to px. Missing attributes stay None.
+        indents = {}
+        for name in (INDENTATION_RIGHT, INDENTATION_LEFT,
+                     INDENTATION_FIRST_LINE):
+            indents[name] = None
+            if ind is not None and name in ind.attrib:
+                raw = int(ind.attrib[name])
+                indents[name] = str((raw / 20) * float(4) / float(3))
+        right = indents[INDENTATION_RIGHT]
+        left = indents[INDENTATION_LEFT]
+        firstLine = indents[INDENTATION_FIRST_LINE]
+        if any([alignment, firstLine, left, right]):
+            return self.indent(text, alignment, firstLine, left, right)
+        return text
+
     def parse_p(self, el, text):
+        if text == '':
+            return ''
+        # TODO This is still not correct, however it fixes the bug. We need to
+        # apply the classes/styles on p, td, li and h tags instead of inline,
+        # but that is for another ticket.
+        text = self.justification(el, text)
+        if self.pre_processor.is_first_list_item(el):
+            return self.parse_list(el, text)
+        if self.pre_processor.heading_level(el):
+            return self.parse_heading(el, text)
+        if self.pre_processor.is_list_item(el):
+            return self.parse_list_item(el, text)
+        if self.pre_processor.is_in_table(el):
+            return self.parse_table_cell_contents(el, text)
         parsed = text
-        if self.in_list:
-            self.in_list = False
-            parsed = self.list_element(parsed)
-        elif (
-                not el.has_child_all('t') and
-                'tbl' not in [i.tag for i in el.parent_list]):
-            parsed = self.linebreak()
-        elif el.parent not in self.elements:
+        # No p tags in li tags
+        if self.list_depth == 0:
             parsed = self.paragraph(parsed)
         return parsed
 
-    def parse_r(self, el):
-        is_deleted = False
-        text = None
-        if el.has_child('t'):
-            text = self.escape(el.find('t').text)
-        elif el.has_child('delText'):
-            text = self.escape(el.find('delText').text)
-            is_deleted = True
-        if text:
-            rpr = el.find('rPr')
-            if rpr is not None:
-                fns = []
-                if rpr.has_child('b'):
-                    fns.append(self.bold)
-                if rpr.has_child('i'):
-                    fns.append(self.italics)
-                if rpr.has_child('u'):
-                    fns.append(self.underline)
-                for fn in fns:
-                    text = fn(text)
-            ppr = el.parent.find('pPr')
-            if ppr is not None:
-                jc = ppr.find('jc')
-                if jc is not None:
-                    if jc.attrib['val'] == 'right':
-                        text = self.right_justify(text)
-                    if jc.attrib['val'] == 'center':
-                        text = self.center_justify(text)
-                ind = ppr.find('ind')
-                if ind is not None:
-                    right = None
-                    left = None
-                    firstLine = None
-                    if 'right' in ind.attrib:
-                        right = ind.attrib['right']
-                        right = int(right)/20
-                        right = str(right)
-                    if 'left' in ind.attrib:
-                        left = ind.attrib['left']
-                        left = int(left)/20
-                        left = str(left)
-                    if 'firstLine' in ind.attrib:
-                        firstLine = ind.attrib['firstLine']
-                        firstLine = int(firstLine)/20
-                        firstLine = str(firstLine)
-                    text = self.indent(text, right, left, firstLine)
-            if is_deleted:
-                text = self.deletion(text, '', '')
+    def _should_append_break_tag(self, next_el):
+        """
+        Tell whether a break tag should be placed before `next_el`. That is
+        only the case when both `next_el` and the element before it are
+        paragraph-like, and never for list items, inline-like content or
+        right after the last list item in the root.
+        """
+        paragraph_like_tags = ['p']
+        inline_like_tags = [
+            'smartTag', 'ins', 'delText',
+        ]
+        if self.pre_processor.is_list_item(next_el):
+            return False
+        previous_el = self.pre_processor.previous(next_el)
+        if previous_el is None:
+            return False
+        # Inline-like content anywhere below next_el rules the break out.
+        for tag in inline_like_tags:
+            if self.memod_tree_op('has_descendant_with_tag', next_el, tag):
+                return False
+        # No break right after the last list item in the root.
+        if self.pre_processor.is_last_list_item_in_root(previous_el):
+            return False
+        # Both the previous and the next element have to be paragraph-like.
+        return (
+            previous_el.tag in paragraph_like_tags and
+            next_el.tag in paragraph_like_tags
+        )
+
+    def parse_heading(self, el, parsed):
+        return self.heading(parsed, self.pre_processor.heading_level(el))
+
+    def parse_list_item(self, el, text):
+        # If for whatever reason we are not currently in a list, then start
+        # a list here. This will only happen if the num_id/ilvl combinations
+        # between lists is not well formed.
+        parsed = text
+        if self.list_depth == 0:
+            return self.parse_list(el, parsed)
+
+        def _should_parse_next_as_content(el):
+            """
+            Get the contents of the next el and append it to the
+            contents of the current el (that way things like tables
+            are actually in the li tag instead of in the ol/ul tag).
+            """
+            next_el = self.pre_processor.next(el)
+            if next_el is None:
+                return False
+            if (
+                    not self.pre_processor.is_list_item(next_el) and
+                    not self.pre_processor.is_last_list_item_in_root(el)
+            ):
+                return True
+            if self.pre_processor.is_first_list_item(next_el):
+                if (
+                        self.pre_processor.num_id(next_el) ==
+                        self.pre_processor.num_id(el)):
+                    return True
+            return False
+
+        while el is not None:
+            if _should_parse_next_as_content(el):
+                el = self.pre_processor.next(el)
+                next_elements_content = self.parse(el)
+                if not next_elements_content:
+                    continue
+                if self._should_append_break_tag(el):
+                    parsed += self.break_tag()
+                parsed += next_elements_content
+            else:
+                break
+        # Create the actual li element
+        return self.list_element(parsed)
+
+    def _get_tcs_in_column(self, tbl, column_index):
+        return [
+            tc for tc in self.memod_tree_op('find_all', tbl, 'tc')
+            if self.pre_processor.column_index(tc) == column_index
+        ]
+
+    def _get_rowspan(self, el, v_merge):
+        restart_in_v_merge = False
+        if v_merge is not None and 'val' in v_merge.attrib:
+            restart_in_v_merge = 'restart' in v_merge.attrib['val']
+
+        if not restart_in_v_merge:
+            return ''
+
+        current_row = self.pre_processor.row_index(el)
+        current_col = self.pre_processor.column_index(el)
+        rowspan = 1
+        result = ''
+        tbl = find_ancestor_with_tag(self.pre_processor, el, 'tbl')
+        # We only want the table cells on current_col whose row_index is not
+        # smaller than current_row (they are collected below, into tcs).
+        if tbl is None:
+            return ''
+
+        tcs = [
+            tc for tc in self.memod_tree_op(
+                '_get_tcs_in_column', tbl, current_col,
+            ) if self.pre_processor.row_index(tc) >= current_row
+        ]
+
+        def should_increment_rowspan(tc):
+            if not self.pre_processor.vmerge_continue(tc):
+                return False
+            return True
+
+        for tc in tcs:
+            if should_increment_rowspan(tc):
+                rowspan += 1
+            else:
+                rowspan = 1
+            if rowspan > 1:
+                result = rowspan
+        return str(result)
+
+    def get_colspan(self, el):
+        grid_span = find_first(el, 'gridSpan')
+        if grid_span is None:
+            return ''
+        return grid_span.attrib['val']
+
+    def parse_table_cell_contents(self, el, text):
+        parsed = text
+
+        next_el = self.pre_processor.next(el)
+        if next_el is not None:
+            if self._should_append_break_tag(next_el):
+                parsed += self.break_tag()
+        return parsed
+
+    def parse_hyperlink(self, el, text):
+        rId = el.get('id')
+        href = self.rels_dict.get(rId)
+        if not href:
             return text
-        else:
+        href = self.escape(href)
+        return self.hyperlink(text, href)
+
+    def _get_image_id(self, el):
+        """
+        Return the id of the image found in `el`, or None when there is none.
+        """
+        # Drawings keep the id in the embed attribute of their blip tag
+        # (thanks a lot Microsoft), picts keep it on their imagedata tag.
+        for tag, attribute in (('blip', 'embed'), ('imagedata', 'id')):
+            found = find_first(el, tag)
+            if found is not None:
+                return found.get(attribute)
+        return None
+
+    def _convert_image_size(self, size):
+        return size / EMUS_PER_PIXEL
+
+    def _get_image_size(self, el):
+        """
+        If we can't find a height or width, return 0 for whichever is not
+        found, then rely on the `image` handler to strip those attributes. This
+        functionality can change once we integrate PIL.
+        """
+        sizes = find_first(el, 'ext')
+        if sizes is not None and sizes.get('cx'):
+            # `cx` is guaranteed by the check above, `cy` is not: default the
+            # height to 0 (as promised in the docstring) so that a missing
+            # `cy` can no longer leave `y` unbound at the return.
+            x = '%dpx' % self._convert_image_size(int(sizes.get('cx')))
+            y = 0
+            if sizes.get('cy'):
+                y = '%dpx' % self._convert_image_size(int(sizes.get('cy')))
+            return x, y
+        shape = find_first(el, 'shape')
+        if shape is not None and shape.get('style') is not None:
+            # If either of these are not set, rely on the method `image` to not
+            # use either of them.
+            x = 0
+            y = 0
+            styles = shape.get('style').split(';')
+
+            for s in styles:
+                if s.startswith('height:'):
+                    y = s.split(':')[1]
+                if s.startswith('width:'):
+                    x = s.split(':')[1]
+            return x, y
+        return 0, 0
+
+    def parse_image(self, el):
+        x, y = self._get_image_size(el)
+        rId = self._get_image_id(el)
+        src = self.rels_dict.get(rId)
+        if not src:
+            return ''
+        src = os.path.join(
+            'word',
+            src,
+        )
+        if src in self._image_data:
+            filename = os.path.split(src)[-1]
+            return self.image(self._image_data[src], filename, x, y)
+        return ''
+
+    def _is_style_on(self, value):
+        """
+        For b, i, u (bold, italics, and underline) merely having the tag is not
+        sufficient. You need to check to make sure it is not set to "false" as
+        well.
+        """
+        return value not in DISABLED_STYLE_VALUES
+
+    def parse_t(self, el, parsed):
+        if el.text is None:
+            return ''
+        return self.escape(el.text)
+
+    def parse_tab(self, el, parsed):
+        return self.tab()
+
+    def parse_hyphen(self, el, parsed):
+        return '-'
+
+    def parse_break_tag(self, el, parsed):
+        return self.break_tag()
+
+    def parse_deletion(self, el, parsed):
+        if el.text is None:
             return ''
+        return self.deletion(el.text, '', '')
 
-    def get_list_style(self, numval):
-        ids = self.numbering_root.findall_all('num')
-        for _id in ids:
-            if _id.attrib['numId'] == numval:
-                abstractid = _id.find('abstractNumId')
-                abstractid = abstractid.attrib['val']
-                style_information = self.numbering_root.findall_all(
-                    'abstractNum',
-                )
-                for info in style_information:
-                    if info.attrib['abstractNumId'] == abstractid:
-                        for i in el_iter(info):
-                            if i.find('numFmt') is not None:
-                                return i.find('numFmt').attrib
-
-    def get_comments(self, doc_id):
-        if self.comment_store is None:
-            # TODO throw appropriate error
-            comment_root = ElementTree.fromstring(
-                remove_namespaces(self.comment_text),
+    def parse_insertion(self, el, parsed):
+        return self.insertion(parsed, '', '')
+
+    def parse_r(self, el, parsed):
+        """
+        Apply the run properties (style defaults, then rPr) to the run text.
+        """
+        text = parsed
+        if not text:
+            return ''
+
+        run_properties = {}
+
+        # Get the rPr for the current style, they are the defaults.
+        p = find_ancestor_with_tag(self.pre_processor, el, 'p')
+        paragraph_style = self.memod_tree_op('find_first', p, 'pStyle')
+        if paragraph_style is not None:
+            style = paragraph_style.get('val')
+            style_defaults = self.styles_dict.get(style, {})
+            run_properties.update(
+                style_defaults.get('default_run_properties', {}),
             )
-            ids_and_info = {}
-            ids = comment_root.findall_all('comment')
-            for _id in ids:
-                ids_and_info[_id.attrib['id']] = {
-                    "author": _id.attrib['author'],
-                    "date": _id.attrib['date'],
-                    "text": _id.findall_all('t')[0].text,
-                }
-            self.comment_store = ids_and_info
-        return self.comment_store[doc_id]
+
+        # Get the rPr for the current r tag, they are overrides.
+        run_properties_element = el.find('rPr')
+        if run_properties_element is not None:
+            local_run_properties = self._parse_run_properties(
+                run_properties_element,
+            )
+            run_properties.update(local_run_properties)
+
+        inline_tag_handlers = {
+            'b': self.bold,
+            'i': self.italics,
+            'u': self.underline,
+            'caps': self.caps,
+            'smallCaps': self.small_caps,
+            'strike': self.strike,
+            'dstrike': self.strike,
+            'vanish': self.hide,
+            'webHidden': self.hide,
+        }
+        styles_needing_application = []
+        for property_name, property_value in run_properties.items():
+            # These tags are a little different, handle them separately
+            # from the rest.
+            # This could be a superscript or a subscript
+            if property_name == 'vertAlign':
+                if property_value == 'superscript':
+                    styles_needing_application.append(self.superscript)
+                elif property_value == 'subscript':
+                    styles_needing_application.append(self.subscript)
+            else:
+                if (
+                        property_name in inline_tag_handlers and
+                        self._is_style_on(property_value)
+                ):
+                    styles_needing_application.append(
+                        inline_tag_handlers[property_name],
+                    )
+
+        # Apply all the handlers.
+        for func in styles_needing_application:
+            text = func(text)
+
+        return text
 
     @property
     def parsed(self):
@@ -335,10 +727,26 @@ def linebreak(self):
     def paragraph(self, text):
         return text
 
+    @abstractmethod
+    def heading(self, text, heading_level):
+        return text
+
     @abstractmethod
     def insertion(self, text, author, date):
         return text
 
+    @abstractmethod
+    def hyperlink(self, text, href):
+        return text
+
+    @abstractmethod
+    def image_handler(self, path):
+        return path
+
+    @abstractmethod
+    def image(self, data, filename, x, y):
+        return self.image_handler(data)
+
     @abstractmethod
     def deletion(self, text, author, date):
         return text
@@ -355,6 +763,30 @@ def italics(self, text):
     def underline(self, text):
         return text
 
+    @abstractmethod
+    def caps(self, text):
+        return text
+
+    @abstractmethod
+    def small_caps(self, text):
+        return text
+
+    @abstractmethod
+    def strike(self, text):
+        return text
+
+    @abstractmethod
+    def hide(self, text):
+        return text
+
+    @abstractmethod
+    def superscript(self, text):
+        return text
+
+    @abstractmethod
+    def subscript(self, text):
+        return text
+
     @abstractmethod
     def tab(self):
         return True
@@ -388,15 +820,5 @@ def page_break(self):
         return True
 
     @abstractmethod
-    def right_justify(self, text):
-        return text
-
-    @abstractmethod
-    def center_justify(self, text):
-        return text
-
-    @abstractmethod
-    def indent(self, text, left=None, right=None, firstLine=None):
-        return text
-
-    #TODO JUSTIFIED JUSTIFIED TEXT
+    def indent(self, text, alignment='', firstLine='', left='', right=''):
+        return text  # order matches justification(); TODO JUSTIFIED TEXT
diff --git a/pydocx/__init__.py b/pydocx/__init__.py
index 9b42e00f..e59babb6 100644
--- a/pydocx/__init__.py
+++ b/pydocx/__init__.py
@@ -1,8 +1,34 @@
-from .parsers import *
+import sys
+from .parsers import Docx2Html, Docx2Markdown
+
 
 def docx2html(path):
     return Docx2Html(path).parsed
 
+
 def docx2markdown(path):
     return Docx2Markdown(path).parsed
 
+VERSION = '0.3.13'
+
+
+def main():
+    try:
+        parser_to_use = sys.argv[1]
+        path_to_docx = sys.argv[2]
+        path_to_html = sys.argv[3]
+    except IndexError:
+        print 'Must specify which parser as well as the file to convert and the name of the resulting file.'  # noqa
+        sys.exit(1)  # usage error: do not report success to the shell
+    if parser_to_use == '--html':
+        html = Docx2Html(path_to_docx).parsed
+    elif parser_to_use == '--markdown':
+        html = Docx2Markdown(path_to_docx).parsed
+    else:
+        print 'Only valid parsers are --html and --markdown'
+        sys.exit(1)  # an unknown parser flag is an error as well
+    with open(path_to_html, 'w') as f:
+        f.write(html.encode('utf-8'))
+
+if __name__ == '__main__':
+    main()
diff --git a/pydocx/exceptions.py b/pydocx/exceptions.py
new file mode 100644
index 00000000..cdff556a
--- /dev/null
+++ b/pydocx/exceptions.py
@@ -0,0 +1,2 @@
+class MalformedDocxException(Exception):
+    pass
diff --git a/pydocx/fixtures/all_configured_styles.docx b/pydocx/fixtures/all_configured_styles.docx
new file mode 100644
index 00000000..8f514372
Binary files /dev/null and b/pydocx/fixtures/all_configured_styles.docx differ
diff --git a/pydocx/fixtures/attachment_is_tiff.docx b/pydocx/fixtures/attachment_is_tiff.docx
new file mode 100644
index 00000000..774362ca
Binary files /dev/null and b/pydocx/fixtures/attachment_is_tiff.docx differ
diff --git a/pydocx/fixtures/bigger_font_size_to_header.docx b/pydocx/fixtures/bigger_font_size_to_header.docx
new file mode 100644
index 00000000..c722888b
Binary files /dev/null and b/pydocx/fixtures/bigger_font_size_to_header.docx differ
diff --git a/pydocx/fixtures/convert_p_to_h.docx b/pydocx/fixtures/convert_p_to_h.docx
new file mode 100644
index 00000000..53769e15
Binary files /dev/null and b/pydocx/fixtures/convert_p_to_h.docx differ
diff --git a/pydocx/fixtures/fake_headings_by_length.docx b/pydocx/fixtures/fake_headings_by_length.docx
new file mode 100644
index 00000000..a130f5ba
Binary files /dev/null and b/pydocx/fixtures/fake_headings_by_length.docx differ
diff --git a/pydocx/fixtures/greek_alphabet.docx b/pydocx/fixtures/greek_alphabet.docx
new file mode 100644
index 00000000..46ab5429
Binary files /dev/null and b/pydocx/fixtures/greek_alphabet.docx differ
diff --git a/pydocx/fixtures/has_image.docx b/pydocx/fixtures/has_image.docx
new file mode 100644
index 00000000..2ebd0bd0
Binary files /dev/null and b/pydocx/fixtures/has_image.docx differ
diff --git a/pydocx/fixtures/has_missing_image.docx b/pydocx/fixtures/has_missing_image.docx
new file mode 100644
index 00000000..996e6671
Binary files /dev/null and b/pydocx/fixtures/has_missing_image.docx differ
diff --git a/pydocx/fixtures/has_title.docx b/pydocx/fixtures/has_title.docx
new file mode 100644
index 00000000..a87d88ed
Binary files /dev/null and b/pydocx/fixtures/has_title.docx differ
diff --git a/pydocx/fixtures/header_footer_problem.docx b/pydocx/fixtures/header_footer_problem.docx
new file mode 100644
index 00000000..6bc49a7a
Binary files /dev/null and b/pydocx/fixtures/header_footer_problem.docx differ
diff --git a/pydocx/fixtures/headers.docx b/pydocx/fixtures/headers.docx
new file mode 100644
index 00000000..890104c7
Binary files /dev/null and b/pydocx/fixtures/headers.docx differ
diff --git a/pydocx/fixtures/headers_with_full_line_styles.docx b/pydocx/fixtures/headers_with_full_line_styles.docx
new file mode 100644
index 00000000..38d6f6a8
Binary files /dev/null and b/pydocx/fixtures/headers_with_full_line_styles.docx differ
diff --git a/pydocx/fixtures/include_tabs.docx b/pydocx/fixtures/include_tabs.docx
new file mode 100644
index 00000000..f7f53e92
Binary files /dev/null and b/pydocx/fixtures/include_tabs.docx differ
diff --git a/pydocx/fixtures/inline_tags.docx b/pydocx/fixtures/inline_tags.docx
new file mode 100644
index 00000000..4aba2347
Binary files /dev/null and b/pydocx/fixtures/inline_tags.docx differ
diff --git a/pydocx/fixtures/justification.docx b/pydocx/fixtures/justification.docx
new file mode 100644
index 00000000..7f8a3bf1
Binary files /dev/null and b/pydocx/fixtures/justification.docx differ
diff --git a/pydocx/fixtures/list_in_table.docx b/pydocx/fixtures/list_in_table.docx
new file mode 100644
index 00000000..d1a87388
Binary files /dev/null and b/pydocx/fixtures/list_in_table.docx differ
diff --git a/pydocx/fixtures/list_to_header.docx b/pydocx/fixtures/list_to_header.docx
new file mode 100644
index 00000000..f9b3946e
Binary files /dev/null and b/pydocx/fixtures/list_to_header.docx differ
diff --git a/pydocx/fixtures/lists_with_styles.docx b/pydocx/fixtures/lists_with_styles.docx
new file mode 100644
index 00000000..c1c7ecf8
Binary files /dev/null and b/pydocx/fixtures/lists_with_styles.docx differ
diff --git a/pydocx/fixtures/localDpi.docx b/pydocx/fixtures/localDpi.docx
new file mode 100644
index 00000000..0f6d7f77
Binary files /dev/null and b/pydocx/fixtures/localDpi.docx differ
diff --git a/pydocx/fixtures/missing_content.docx b/pydocx/fixtures/missing_content.docx
new file mode 100644
index 00000000..21bed964
Binary files /dev/null and b/pydocx/fixtures/missing_content.docx differ
diff --git a/pydocx/fixtures/missing_numbering.docx b/pydocx/fixtures/missing_numbering.docx
new file mode 100644
index 00000000..5034f524
Binary files /dev/null and b/pydocx/fixtures/missing_numbering.docx differ
diff --git a/pydocx/fixtures/missing_style.docx b/pydocx/fixtures/missing_style.docx
new file mode 100644
index 00000000..3ded985c
Binary files /dev/null and b/pydocx/fixtures/missing_style.docx differ
diff --git a/pydocx/fixtures/nested_lists.docx b/pydocx/fixtures/nested_lists.docx
new file mode 100644
index 00000000..0f9cecbd
Binary files /dev/null and b/pydocx/fixtures/nested_lists.docx differ
diff --git a/pydocx/fixtures/nested_table_rowspan.docx b/pydocx/fixtures/nested_table_rowspan.docx
new file mode 100644
index 00000000..b43b8a0d
Binary files /dev/null and b/pydocx/fixtures/nested_table_rowspan.docx differ
diff --git a/pydocx/fixtures/nested_tables.docx b/pydocx/fixtures/nested_tables.docx
new file mode 100644
index 00000000..af704d4d
Binary files /dev/null and b/pydocx/fixtures/nested_tables.docx differ
diff --git a/pydocx/fixtures/no_break_hyphen.docx b/pydocx/fixtures/no_break_hyphen.docx
new file mode 100644
index 00000000..64d68fa3
Binary files /dev/null and b/pydocx/fixtures/no_break_hyphen.docx differ
diff --git a/pydocx/fixtures/resized_image.docx b/pydocx/fixtures/resized_image.docx
new file mode 100644
index 00000000..913099c4
Binary files /dev/null and b/pydocx/fixtures/resized_image.docx differ
diff --git a/pydocx/fixtures/shift_enter.docx b/pydocx/fixtures/shift_enter.docx
new file mode 100644
index 00000000..4128c0a2
Binary files /dev/null and b/pydocx/fixtures/shift_enter.docx differ
diff --git a/pydocx/fixtures/simple.docx b/pydocx/fixtures/simple.docx
new file mode 100644
index 00000000..1d2a1c23
Binary files /dev/null and b/pydocx/fixtures/simple.docx differ
diff --git a/pydocx/fixtures/simple_lists.docx b/pydocx/fixtures/simple_lists.docx
new file mode 100644
index 00000000..c09ad744
Binary files /dev/null and b/pydocx/fixtures/simple_lists.docx differ
diff --git a/pydocx/fixtures/simple_table.docx b/pydocx/fixtures/simple_table.docx
new file mode 100644
index 00000000..26de483c
Binary files /dev/null and b/pydocx/fixtures/simple_table.docx differ
diff --git a/pydocx/fixtures/special_chars.docx b/pydocx/fixtures/special_chars.docx
new file mode 100644
index 00000000..b4b9287f
Binary files /dev/null and b/pydocx/fixtures/special_chars.docx differ
diff --git a/pydocx/fixtures/split_header.docx b/pydocx/fixtures/split_header.docx
new file mode 100644
index 00000000..cc4bd5cf
Binary files /dev/null and b/pydocx/fixtures/split_header.docx differ
diff --git a/pydocx/fixtures/styled_bolding.docx b/pydocx/fixtures/styled_bolding.docx
new file mode 100644
index 00000000..90c6b157
Binary files /dev/null and b/pydocx/fixtures/styled_bolding.docx differ
diff --git a/pydocx/fixtures/super_and_subscript.docx b/pydocx/fixtures/super_and_subscript.docx
new file mode 100644
index 00000000..06ea2d7a
Binary files /dev/null and b/pydocx/fixtures/super_and_subscript.docx differ
diff --git a/pydocx/fixtures/table_col_row_span.docx b/pydocx/fixtures/table_col_row_span.docx
new file mode 100644
index 00000000..856abfdf
Binary files /dev/null and b/pydocx/fixtures/table_col_row_span.docx differ
diff --git a/pydocx/fixtures/tables_in_lists.docx b/pydocx/fixtures/tables_in_lists.docx
new file mode 100644
index 00000000..11859541
Binary files /dev/null and b/pydocx/fixtures/tables_in_lists.docx differ
diff --git a/pydocx/fixtures/track_changes_on.docx b/pydocx/fixtures/track_changes_on.docx
new file mode 100644
index 00000000..dcb7ba1c
Binary files /dev/null and b/pydocx/fixtures/track_changes_on.docx differ
diff --git a/pydocx/fixtures/upper_alpha_all_bold.docx b/pydocx/fixtures/upper_alpha_all_bold.docx
new file mode 100644
index 00000000..d518b2c5
Binary files /dev/null and b/pydocx/fixtures/upper_alpha_all_bold.docx differ
diff --git a/pydocx/lxmlparser.py b/pydocx/lxmlparser.py
deleted file mode 100644
index 94b130d3..00000000
--- a/pydocx/lxmlparser.py
+++ /dev/null
@@ -1,111 +0,0 @@
-import zipfile
-from lxml import etree
-from StringIO import StringIO
-__author__ = 'samportnow'
-
-#for el in tree.iter():
-    # The way lists are handled could double visit certain elements; keep
-    # track of which elements have been visited and skip any that have been
-    # visited already.
-    #if el in visited_nodes:
-        #continue
-with zipfile.ZipFile('/Users/samportnow/Documents/pydocx/helloworld.docx') as f:
-    document = f.read('word/document.xml')
-    numbering= f.read('word/numbering.xml')
-parser=etree.XMLParser(ns_clean=True)
-document=StringIO(document)
-numbering=StringIO(numbering)
-numbering_tree=etree.parse(numbering,parser)
-numbering_namespace=numbering_tree.getroot().nsmap['w']
-visited_els=[]
-
-def get_parsed():
-    parser=etree.XMLParser(ns_clean=True)
-    tree=etree.parse(document,parser)
-    namespace=tree.getroot().nsmap['w']
-    #rpr is run properties for the paragraph mark
-    paragraph=''
-    run_text=''
-    running_text=''
-    for el in tree.iter():
-        if el.tag=='{%s}p' %namespace:
-            for wp in el.iter():
-                if wp.tag =='{%s}ins' %namespace:
-                    for text in wp.iterchildren():
-                        if text not in visited_els:
-                            run_text +='<div class=insert>'+get_text(text,namespace,visited_els)+'</div>'
-                            visited_els.append(text)
-                if wp.tag=='{%s}r' %namespace and wp not in visited_els:
-                    run_text+=get_text(wp,namespace,visited_els)
-                    visited_els.append(wp)
-                if not el.getchildren():
-                    run_text+='<br>'
-                if wp.tag == '{%s}ilvl' %namespace:
-                    for lst in el.iter():
-                        if lst.find('{%s}numId' %namespace) is not None and el not in visited_els:
-                            numval = lst.find('{%s}numId' %namespace).attrib['{%s}val' %namespace]
-                            lst_type=get_list_style(numval)
-                        if get_text(lst,namespace,visited_els) and el not in visited_els and lst_type['{%s}val' %namespace] != 'bullet':
-                            if lst.getnext() is not None:
-                                if lst not in visited_els:
-                                    while lst.getnext() is not None:
-                                        if lst not in visited_els:
-                                            text = get_text(lst,namespace,visited_els)
-                                            next_txt = get_text(lst.getnext(),namespace,visited_els)
-                                            running_text += text + next_txt
-                                            visited_els.append(lst)
-                                            visited_els.append(lst.getnext())
-                                            lst=lst.getnext()
-                                        else:
-                                            run_text += '<li>' + running_text + '</li>'
-                                            break
-                            else:
-                                run_text +='<li>' +  get_text(lst, namespace, visited_els) + '</li>'
-                                visited_els.append(lst)
-    print running_text
-    return run_text
-
-
-def get_text(wp,namespace,visited_els):
-    run_text= ''
-    decorator = ''
-    closing = ''
-    if wp.find('{%s}tab' %namespace) is not None:
-        run_text+='%nbsp'
-    if wp.find('{%s}rPr' %namespace) is not None:
-        for tag in wp.iter():
-            if tag.find('{%s}u' %namespace) is not None:
-                if wp.find('{%s}t' %namespace) is not None:
-                    decorator +='<u>'
-                    closing += '</u>'
-                    visited_els.append(wp.find('{%s}t' %namespace))
-            if tag.find('{%s}i' %namespace) is not None:
-                if wp.find('{%s}t' %namespace) is not None:
-                    decorator += '<i>'
-                    closing += '</i>'
-                    visited_els.append(wp.find('{%s}t' %namespace))
-            if tag.find('{%s}b' %namespace) is not None:
-                if wp.find('{%s}t' %namespace) is not None:
-                    decorator += '<b>'
-                    closing += '</b>'
-                    visited_els.append(wp.find('{%s}t' %namespace))
-        run_text = wp.find('{%s}t' %namespace).text
-        run_text = decorator + run_text + closing
-    if wp.find('{%s}t' %namespace) is not None and wp.find('{%s}t' %namespace) not in visited_els:
-        run_text+=wp.find('{%s}t' %namespace).text
-    return run_text
-
-def get_list_style(numval):
-    ids = numbering_tree.findall('{%s}num' %numbering_namespace)
-    for id in ids:
-        if id.attrib['{%s}numId' %numbering_namespace] == numval:
-            abstractid=id.find('{%s}abstractNumId' %numbering_namespace)
-            abstractid=abstractid.attrib['{%s}val' %numbering_namespace]
-            style_information=numbering_tree.findall('{%s}abstractNum' %numbering_namespace)
-            for info in style_information:
-                if info.attrib['{%s}abstractNumId' %numbering_namespace] == abstractid:
-                    for i in info.iter():
-                        if i.find('{%s}numFmt' %numbering_namespace) is not None:
-                            return i.find('{%s}numFmt' %numbering_namespace).attrib
-
-print get_parsed()
diff --git a/pydocx/parsers/Docx2Html.py b/pydocx/parsers/Docx2Html.py
index bfaad2a6..e4067a10 100644
--- a/pydocx/parsers/Docx2Html.py
+++ b/pydocx/parsers/Docx2Html.py
@@ -1,21 +1,46 @@
-from pydocx.DocxParser import DocxParser
-
+import base64
 import xml.sax.saxutils
 
+from pydocx.DocxParser import DocxParser
+
 
 class Docx2Html(DocxParser):
 
     @property
     def parsed(self):
-        self._parsed = self._parsed.replace('<p></p><p></p>', '<br />')
-        self._parsed = self._parsed.replace('</p><br /><p>', '</p><p>')
-        self._parsed = self._parsed.replace('</p><br /><ul>', '</p><ul>')
-        return (
-            '<html><head><style>.insert{{color:red}}.delete'
-            '{{color:red; text-decoration:line-through}}.center'
-            '{{text-align:center}}.right{{text-align:right}}'
-            '</style></head><body>{content}</body></html>'
-        ).format(content=self._parsed)
+        content = self._parsed
+        content = "<html>%(head)s<body>%(content)s</body></html>" % {
+            'head': self.head(),
+            'content': content,
+        }
+        return unicode(content)
+
+    def head(self):
+        return "<head>%(style)s</head>" % {
+            'style': self.style(),
+        }
+
+    def style(self):
+        result = (
+            '<style>'
+            '.pydocx-insert {color:green;}'
+            '.pydocx-delete {color:red;text-decoration:line-through;}'
+            '.pydocx-center {text-align:center;}'
+            '.pydocx-right {text-align:right;}'
+            '.pydocx-left {text-align:left;}'
+            '.pydocx-comment {color:blue;}'
+            '.pydocx-underline {text-decoration: underline;}'
+            '.pydocx-caps {text-transform:uppercase;}'
+            '.pydocx-small-caps {font-variant: small-caps;}'
+            '.pydocx-strike {text-decoration: line-through;}'
+            '.pydocx-hidden {visibility: hidden;}'
+            'body {width:%(width)spx;margin:0px auto;}'
+            '</style>'
+        ) % {
+            # Meant to scale by 4/3 to px; on py2 (4 / 3) == 1 (int division)
+            'width': (self.page_width * (4 / 3)),
+        }
+        return result
 
     def escape(self, text):
         return xml.sax.saxutils.quoteattr(text)[1:-1]
@@ -26,61 +51,155 @@ def linebreak(self, pre=None):
     def paragraph(self, text, pre=None):
         return '<p>' + text + '</p>'
 
+    def heading(self, text, heading_value):
+        return '<%(tag)s>%(text)s</%(tag)s>' % {
+            'tag': heading_value,
+            'text': text,
+        }
+
     def insertion(self, text, author, date):
         return (
-            "<span class='insert' author='{author}' "
-            "date='{date}'>{text}</span>"
-        ).format(author=author, date=date, text=text)
+            "<span class='pydocx-insert'>%(text)s</span>"
+        ) % {
+            'author': author,
+            'date': date,
+            'text': text,
+        }
+
+    def hyperlink(self, text, href):
+        if text == '':
+            return ''
+        return '<a href="%(href)s">%(text)s</a>' % {
+            'href': href,
+            'text': text,
+        }
+
+    def image_handler(self, image_data, filename):
+        extension = filename.split('.')[-1].lower()
+        b64_encoded_src = 'data:image/%s;base64,%s' % (
+            extension,
+            base64.b64encode(image_data),
+        )
+        b64_encoded_src = self.escape(b64_encoded_src)
+        return b64_encoded_src
+
+    def image(self, image_data, filename, x, y):
+        src = self.image_handler(image_data, filename)
+        if not src:
+            return ''
+        if all([x, y]):
+            return '<img src="%s" height="%s" width="%s" />' % (
+                src,
+                y,
+                x,
+            )
+        else:
+            return '<img src="%s" />' % src
 
     def deletion(self, text, author, date):
         return (
-            "<span class='delete' author='{author}' "
-            "date='{date}'>{text}</span>"
-        ).format(author=author, date=date, text=text)
+            "<span class='pydocx-delete'>%(text)s</span>"
+        ) % {
+            'author': author,
+            'date': date,
+            'text': text,
+        }
 
     def list_element(self, text):
-        return "<li>{text}</li>".format(text=text)
+        return "<li>%(text)s</li>" % {
+            'text': text,
+        }
 
-    def ordered_list(self, text):
-        return "<ol>{text}</ol>".format(text=text)
+    def ordered_list(self, text, list_style):
+        return '<ol list-style-type="%(list_style)s">%(text)s</ol>' % {
+            'text': text,
+            'list_style': list_style,
+        }
 
     def unordered_list(self, text):
-        return "<ul>{text}</ul>".format(text=text)
+        return "<ul>%(text)s</ul>" % {
+            'text': text,
+        }
 
     def bold(self, text):
-        return '<b>' + text + '</b>'
+        return '<strong>' + text + '</strong>'
 
     def italics(self, text):
-        return '<i>' + text + '</i>'
+        return '<em>' + text + '</em>'
 
     def underline(self, text):
-        return '<u>' + text + '</u>'
+        return '<span class="pydocx-underline">' + text + '</span>'
+
+    def caps(self, text):
+        return '<span class="pydocx-caps">' + text + '</span>'
+
+    def small_caps(self, text):
+        return '<span class="pydocx-small-caps">' + text + '</span>'
+
+    def strike(self, text):
+        return '<span class="pydocx-strike">' + text + '</span>'
+
+    def hide(self, text):
+        return '<span class="pydocx-hidden">' + text + '</span>'
+
+    def superscript(self, text):
+        return '<sup>%(text)s</sup>' % {
+            'text': text,
+        }
+
+    def subscript(self, text):
+        return '<sub>%(text)s</sub>' % {
+            'text': text,
+        }
 
     def tab(self):
         # Insert before the text right?? So got the text and just do an insert
         # at the beginning!
-        return '&nbsp&nbsp&nbsp&nbsp'
+        return '&nbsp;&nbsp;&nbsp;&nbsp;'
 
     def table(self, text):
-        return '<table border=1>' + text + '</table>'
+        return '<table border="1">' + text + '</table>'
 
     def table_row(self, text):
         return '<tr>' + text + '</tr>'
 
-    def table_cell(self, text):
-        return '<td>' + text + '</td>'
+    def table_cell(self, text, col='', row=''):
+        slug = '<td'
+        if col:
+            slug += ' colspan="%(colspan)s"'
+        if row:
+            slug += ' rowspan="%(rowspan)s"'
+        slug += '>%(text)s</td>'
+        return slug % {
+            'colspan': col,
+            'rowspan': row,
+            'text': text,
+        }
 
     def page_break(self):
-        return '<hr>'
-
-    def center_justify(self, text):
-        return "<div class = 'center'>" + text + '</div>'
-
-    def right_justify(self, text):
-        return "<div class = 'right'>" + text + '</div>'
-
-    def indent(self, text, right, left, firstLine):
-        return "<div style = 'margin-left:{left}pt'>{text}</div>".format(
-            left=left,
-            text=text,
-        )
+        return '<hr />'
+
+    def indent(self, text, just='', firstLine='', left='', right=''):
+        slug = '<div'
+        if just:
+            slug += " class='pydocx-%(just)s'"
+        if firstLine or left or right:
+            slug += " style='"
+            if firstLine:
+                slug += "text-indent:%(firstLine)spx;"
+            if left:
+                slug += "margin-left:%(left)spx;"
+            if right:
+                slug += "margin-right:%(right)spx;"
+            slug += "'"
+        slug += ">%(text)s</div>"
+        return slug % {
+            'text': text,
+            'just': just,
+            'firstLine': firstLine,
+            'left': left,
+            'right': right,
+        }
+
+    def break_tag(self):
+        return '<br />'
diff --git a/pydocx/parsers/Docx2Markdown.py b/pydocx/parsers/Docx2Markdown.py
index 1bb43e16..d023df7a 100644
--- a/pydocx/parsers/Docx2Markdown.py
+++ b/pydocx/parsers/Docx2Markdown.py
@@ -1,5 +1,6 @@
 from pydocx.DocxParser import DocxParser
 
+
 class Docx2Markdown(DocxParser):
     def escape(self, text):
         return text
@@ -17,8 +18,9 @@ def bold(self, text):
         return '**' + text + '**'
 
     def italics(self, text):
-        # TODO do we need a "pre" variable, so I can check for *italics**italics* and turn it into *italicsitatlics*?
+        # TODO do we need a "pre" variable, so I can check for
+        # *italics**italics* and turn it into *italicsitalics*?
         return '*' + text + '*'
 
     def underline(self, text):
-        return '***' +text + '***'
\ No newline at end of file
+        return '***' + text + '***'
diff --git a/pydocx/parsers/__init__.py b/pydocx/parsers/__init__.py
index a9524657..7684ae65 100644
--- a/pydocx/parsers/__init__.py
+++ b/pydocx/parsers/__init__.py
@@ -1,2 +1,4 @@
-from .Docx2Html import *
-from .Docx2Markdown import *
\ No newline at end of file
+from pydocx.parsers.Docx2Html import Docx2Html
+from pydocx.parsers.Docx2Markdown import Docx2Markdown
+
+__all__ = ('Docx2Html', 'Docx2Markdown')  # entries must be name strings
diff --git a/pydocx/tests/__init__.py b/pydocx/tests/__init__.py
new file mode 100644
index 00000000..82341e05
--- /dev/null
+++ b/pydocx/tests/__init__.py
@@ -0,0 +1,196 @@
+#from unittest import TestCase
+import re
+from contextlib import contextmanager
+
+from pydocx.parsers.Docx2Html import Docx2Html
+from pydocx.utils import (
+    parse_xml_from_string,
+)
+from pydocx.tests.document_builder import DocxBuilder as DXB
+from unittest import TestCase
+
+STYLE = (
+    '<style>'
+    '.pydocx-insert {color:green;}'
+    '.pydocx-delete {color:red;text-decoration:line-through;}'
+    '.pydocx-center {text-align:center;}'
+    '.pydocx-right {text-align:right;}'
+    '.pydocx-left {text-align:left;}'
+    '.pydocx-comment {color:blue;}'
+    '.pydocx-underline {text-decoration: underline;}'
+    '.pydocx-caps {text-transform:uppercase;}'
+    '.pydocx-small-caps {font-variant: small-caps;}'
+    '.pydocx-strike {text-decoration: line-through;}'
+    '.pydocx-hidden {visibility: hidden;}'
+    'body {width:612px;margin:0px auto;}'
+    '</style>'
+)
+
+BASE_HTML = '''
+<html>
+    <head>
+    %s
+    </head>
+    <body>%%s</body>
+</html>
+''' % STYLE
+
+
+def assert_html_equal(actual_html, expected_html):
+    assert collapse_html(
+        actual_html,
+    ) == collapse_html(
+        expected_html
+    ), actual_html
+
+
+def collapse_html(html):
+    """
+    Remove insignificant whitespace from the html.
+
+    >>> print collapse_html('''\\
+    ...     <h1>
+    ...         Heading
+    ...     </h1>
+    ... ''')
+    <h1>Heading</h1>
+    >>> print collapse_html('''\\
+    ...     <p>
+    ...         Paragraph with
+    ...         multiple lines.
+    ...     </p>
+    ... ''')
+    <p>Paragraph with multiple lines.</p>
+    """
+    def smart_space(match):
+        # Put a space in between lines, unless at least one side of the line
+        # break butts up against a tag.
+        before = match.group(1)
+        after = match.group(2)
+        space = ' '
+        if before == '>' or after == '<':
+            space = ''
+        return before + space + after
+    # Replace newlines and their surrounding whitespace with a single space (or
+    # empty string)
+    html = re.sub(
+        r'(>?)\s*\n\s*(<?)',
+        smart_space,
+        html,
+    )
+    return html.strip()
+
+
+class XMLDocx2Html(Docx2Html):
+    """
+    Create the object without passing in a path to the document, set them
+    manually.
+    """
+    def __init__(self, *args, **kwargs):
+        # Pass in nothing for the path
+        super(XMLDocx2Html, self).__init__(path=None, *args, **kwargs)
+
+    def _build_data(
+            self,
+            path,
+            document_xml=None,
+            rels_dict=None,
+            numbering_dict=None,
+            styles_dict=None,
+            *args, **kwargs):
+        self._test_rels_dict = rels_dict
+        if rels_dict:
+            for value in rels_dict.values():
+                self._image_data['word/%s' % value] = 'word/%s' % value
+        self.numbering_root = None
+        if numbering_dict is not None:
+            self.numbering_root = parse_xml_from_string(
+                DXB.numbering(numbering_dict),
+            )
+        self.numbering_dict = numbering_dict
+        # Intentionally not calling super
+        if document_xml is not None:
+            self.root = parse_xml_from_string(document_xml)
+        self.zip_path = ''
+
+        # This is the standard page width for a word document, Also the page
+        # width that we are looking for in the test.
+        self.page_width = 612
+
+        self.styles_dict = styles_dict
+
+    def _parse_rels_root(self, *args, **kwargs):
+        if self._test_rels_dict is None:
+            return {}
+        return self._test_rels_dict
+
+    def get_list_style(self, num_id, ilvl):
+        try:
+            return self.numbering_dict[num_id][ilvl]
+        except KeyError:
+            return 'decimal'
+
+    def _parse_styles(self):
+        if self.styles_dict is None:
+            return {}
+        return self.styles_dict
+
+
+DEFAULT_NUMBERING_DICT = {
+    '1': {
+        '0': 'decimal',
+        '1': 'decimal',
+    },
+    '2': {
+        '0': 'lowerLetter',
+        '1': 'lowerLetter',
+    },
+}
+
+
+class _TranslationTestCase(TestCase):
+    expected_output = None
+    relationship_dict = None
+    styles_dict = None
+    numbering_dict = DEFAULT_NUMBERING_DICT
+    run_expected_output = True
+    parser = XMLDocx2Html
+    use_base_html = True
+    convert_root_level_upper_roman = False
+
+    def get_xml(self):
+        raise NotImplementedError()
+
+    @contextmanager
+    def toggle_run_expected_output(self):
+        self.run_expected_output = not self.run_expected_output
+        yield
+        self.run_expected_output = not self.run_expected_output
+
+    def test_expected_output(self):
+        if self.expected_output is None:
+            raise NotImplementedError('expected_output is not defined')
+        if not self.run_expected_output:
+            return
+
+        # Create the xml
+        tree = self.get_xml()
+
+        # Verify the final output.
+        parser = self.parser
+
+        def image_handler(self, src, *args, **kwargs):
+            return src
+        parser.image_handler = image_handler
+        html = parser(
+            convert_root_level_upper_roman=self.convert_root_level_upper_roman,
+            document_xml=tree,
+            rels_dict=self.relationship_dict,
+            numbering_dict=self.numbering_dict,
+            styles_dict=self.styles_dict,
+        ).parsed
+
+        if self.use_base_html:
+            assert_html_equal(html, BASE_HTML % self.expected_output)
+        else:
+            assert_html_equal(html, self.expected_output)
diff --git a/pydocx/tests/document_builder.py b/pydocx/tests/document_builder.py
new file mode 100644
index 00000000..c28e1e02
--- /dev/null
+++ b/pydocx/tests/document_builder.py
@@ -0,0 +1,271 @@
+from jinja2 import Environment, PackageLoader
+from pydocx.DocxParser import EMUS_PER_PIXEL
+
+templates = {
+    'delete': 'text_delete.xml',
+    'drawing': 'drawing.xml',
+    'hyperlink': 'hyperlink.xml',
+    'insert': 'insert.xml',
+    'linebreak': 'linebreak.xml',
+    'main': 'base.xml',
+    'numbering': 'numbering.xml',
+    'p': 'p.xml',
+    'pict': 'pict.xml',
+    'r': 'r.xml',
+    'rpr': 'rpr.xml',
+    'sdt': 'sdt.xml',
+    'sectPr': 'sectPr.xml',
+    'smartTag': 'smart_tag.xml',
+    'style': 'style.xml',
+    'styles': 'styles.xml',
+    't': 't.xml',
+    'table': 'table.xml',
+    'tc': 'tc.xml',
+    'tr': 'tr.xml',
+}
+
+env = Environment(
+    loader=PackageLoader(
+        'pydocx.tests',
+        'templates',
+    ),
+)
+
+
+class DocxBuilder(object):
+
+    @classmethod
+    def xml(self, body):
+        template = env.get_template(templates['main'])
+        return template.render(body=body)
+
+    @classmethod
+    def p_tag(
+            self,
+            text,
+            style='style0',
+            jc=None,
+    ):
+        if isinstance(text, str):
+            # Create a single r tag containing the text
+            run_tag = DocxBuilder.r_tag(
+                [DocxBuilder.t_tag(text)],
+            )
+            run_tags = [run_tag]
+        elif isinstance(text, list):
+            run_tags = text
+        else:
+            run_tags = [self.r_tag([])]
+        template = env.get_template(templates['p'])
+
+        kwargs = {
+            'run_tags': run_tags,
+            'style': style,
+            'jc': jc,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def linebreak(self):
+        template = env.get_template(templates['linebreak'])
+        kwargs = {}
+        return template.render(**kwargs)
+
+    @classmethod
+    def t_tag(self, text):
+        template = env.get_template(templates['t'])
+        kwargs = {
+            'text': text,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def r_tag(
+            self,
+            elements,
+            rpr=None,
+    ):
+        template = env.get_template(templates['r'])
+        if rpr is None:
+            rpr = DocxBuilder.rpr_tag()
+        kwargs = {
+            'elements': elements,
+            'rpr': rpr,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def rpr_tag(self, inline_styles=None, *args, **kwargs):
+        if inline_styles is None:
+            inline_styles = {}
+        valid_styles = (
+            'b',
+            'i',
+            'u',
+            'caps',
+            'smallCaps',
+            'strike',
+            'dstrike',
+            'vanish',
+            'webHidden',
+            'vertAlign',
+        )
+        for key in inline_styles:
+            if key not in valid_styles:
+                raise AssertionError('%s is not a valid style' % key)
+        template = env.get_template(templates['rpr'])
+        kwargs = {
+            'tags': inline_styles,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def hyperlink_tag(self, r_id, run_tags):
+        template = env.get_template(templates['hyperlink'])
+        kwargs = {
+            'r_id': r_id,
+            'run_tags': run_tags,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def insert_tag(self, run_tags):
+        template = env.get_template(templates['insert'])
+        kwargs = {
+            'run_tags': run_tags,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def delete_tag(self, deleted_texts):
+        template = env.get_template(templates['delete'])
+        kwargs = {
+            'deleted_texts': deleted_texts,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def smart_tag(self, run_tags):
+        template = env.get_template(templates['smartTag'])
+        kwargs = {
+            'run_tags': run_tags,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def sdt_tag(self, p_tag):
+        template = env.get_template(templates['sdt'])
+        kwargs = {
+            'p_tag': p_tag,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def li(self, text, ilvl, numId, bold=False):
+        """Render a list-item `w:p`; text is a str or [(text, bold), ...]."""
+        if isinstance(text, str):
+            run_tags = [DocxBuilder.r_tag([DocxBuilder.t_tag(text)], bold)]
+        elif isinstance(text, list):
+            run_tags = []
+            for run_text, run_bold in text:
+                # One run per pair; the t tag wraps this pair's own text.
+                run_tags.append(
+                    DocxBuilder.r_tag(
+                        [DocxBuilder.t_tag(run_text)],
+                        run_bold,
+                    ),
+                )
+        else:
+            raise AssertionError('text must be a string or a list')
+        template = env.get_template(templates['p'])
+
+        kwargs = {
+            'run_tags': run_tags,
+            'is_list': True,
+            'ilvl': ilvl,
+            'numId': numId,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def table_cell(self, paragraph, merge=False, merge_continue=False):
+        kwargs = {
+            'paragraph': paragraph,
+            'merge': merge,
+            'merge_continue': merge_continue
+        }
+        template = env.get_template(templates['tc'])
+        return template.render(**kwargs)
+
+    @classmethod
+    def table_row(self, tcs):
+        template = env.get_template(templates['tr'])
+        return template.render(table_cells=tcs)
+
+    @classmethod
+    def table(self, trs):
+        template = env.get_template(templates['table'])
+        return template.render(table_rows=trs)
+
+    @classmethod
+    def drawing(self, r_id, height=None, width=None):
+        template = env.get_template(templates['drawing'])
+        if height is not None:
+            height = height * EMUS_PER_PIXEL
+        if width is not None:
+            width = width * EMUS_PER_PIXEL
+        kwargs = {
+            'r_id': r_id,
+            'height': height,
+            'width': width,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def pict(self, r_id=None, height=None, width=None):
+        template = env.get_template(templates['pict'])
+        kwargs = {
+            'r_id': r_id,
+            'height': height,
+            'width': width,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def sectPr_tag(self, p_tag):
+        template = env.get_template(templates['sectPr'])
+
+        kwargs = {
+            'p_tag': p_tag,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def styles_xml(self, style_tags):
+        template = env.get_template(templates['styles'])
+
+        kwargs = {
+            'style_tags': style_tags,
+        }
+        return template.render(**kwargs)
+
+    @classmethod
+    def style(self, style_id, value):
+        template = env.get_template(templates['style'])
+
+        kwargs = {
+            'style_id': style_id,
+            'value': value,
+        }
+
+        return template.render(**kwargs)
+
+    @classmethod
+    def numbering(self, numbering_dict):
+        template = env.get_template(templates['numbering'])
+
+        kwargs = {
+            'numbering_dict': numbering_dict,
+        }
+
+        return template.render(**kwargs)
diff --git a/pydocx/tests/templates/base.xml b/pydocx/tests/templates/base.xml
new file mode 100644
index 00000000..60027500
--- /dev/null
+++ b/pydocx/tests/templates/base.xml
@@ -0,0 +1,4 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<w:document xmlns:o="urn:schemas-microsoft-com:office:office" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:v="urn:schemas-microsoft-com:vml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:w10="urn:schemas-microsoft-com:office:word" xmlns:wp="http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing">
+	<w:body>{{ body }}</w:body>
+</w:document>
diff --git a/pydocx/tests/templates/drawing.xml b/pydocx/tests/templates/drawing.xml
new file mode 100644
index 00000000..dfd470b4
--- /dev/null
+++ b/pydocx/tests/templates/drawing.xml
@@ -0,0 +1,65 @@
+<w:p>
+	<w:pPr>
+		<w:pStyle w:val="style0"/>
+	</w:pPr>
+	<w:r>
+		<w:rPr/>
+		<w:drawing>
+			<wp:anchor allowOverlap="1" behindDoc="0" distB="0" distL="0" distR="0" distT="0" layoutInCell="1" locked="0" relativeHeight="0" simplePos="0">
+				<wp:simplePos x="0" y="0"/>
+				<wp:positionH relativeFrom="column">
+					<wp:posOffset>2397125</wp:posOffset>
+				</wp:positionH>
+				<wp:positionV relativeFrom="paragraph">
+					<wp:posOffset>0</wp:posOffset>
+				</wp:positionV>
+				<wp:extent cx="1537970" cy="354965"/>
+				<wp:effectExtent b="0" l="0" r="0" t="0"/>
+				<wp:wrapSquare wrapText="largest"/>
+				<wp:docPr descr="A description..." id="1" name="Picture"/>
+				<wp:cNvGraphicFramePr>
+					<a:graphicFrameLocks xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main" noChangeAspect="1"/>
+				</wp:cNvGraphicFramePr>
+				<a:graphic xmlns:a="http://schemas.openxmlformats.org/drawingml/2006/main">
+					<a:graphicData uri="http://schemas.openxmlformats.org/drawingml/2006/picture">
+						<pic:pic xmlns:pic="http://schemas.openxmlformats.org/drawingml/2006/picture">
+							<pic:nvPicPr>
+								<pic:cNvPr descr="A description..." id="0" name="Picture"/>
+								<pic:cNvPicPr>
+									<a:picLocks noChangeArrowheads="1" noChangeAspect="1"/>
+								</pic:cNvPicPr>
+							</pic:nvPicPr>
+							<pic:blipFill>
+								<a:blip r:embed="{{ r_id }}"/>
+								<a:srcRect/>
+								<a:stretch>
+									<a:fillRect/>
+								</a:stretch>
+							</pic:blipFill>
+							<pic:spPr bwMode="auto">
+								<a:xfrm>
+									<a:off x="0" y="0"/>
+									<a:ext {% if width != None %}
+                                    cx="{{ width }}" {% endif%}
+                                    {% if height != None %}
+                                    cy="{{ height }}"
+                                    {% endif %}/>
+								</a:xfrm>
+								<a:prstGeom prst="rect">
+									<a:avLst/>
+								</a:prstGeom>
+								<a:noFill/>
+								<a:ln w="9525">
+									<a:noFill/>
+									<a:miter lim="800000"/>
+									<a:headEnd/>
+									<a:tailEnd/>
+								</a:ln>
+							</pic:spPr>
+						</pic:pic>
+					</a:graphicData>
+				</a:graphic>
+			</wp:anchor>
+		</w:drawing>
+	</w:r>
+</w:p>
diff --git a/pydocx/tests/templates/hyperlink.xml b/pydocx/tests/templates/hyperlink.xml
new file mode 100644
index 00000000..83645948
--- /dev/null
+++ b/pydocx/tests/templates/hyperlink.xml
@@ -0,0 +1,5 @@
+<w:hyperlink r:id="{{ r_id }}">
+	{% for run_tag in run_tags %}
+		{{ run_tag }}
+	{% endfor %}
+</w:hyperlink>
diff --git a/pydocx/tests/templates/insert.xml b/pydocx/tests/templates/insert.xml
new file mode 100644
index 00000000..afeb2691
--- /dev/null
+++ b/pydocx/tests/templates/insert.xml
@@ -0,0 +1,5 @@
+<w:ins>
+	{% for run_tag in run_tags %}
+		{{ run_tag }}
+	{% endfor %}
+</w:ins>
diff --git a/pydocx/tests/templates/linebreak.xml b/pydocx/tests/templates/linebreak.xml
new file mode 100644
index 00000000..ab92e811
--- /dev/null
+++ b/pydocx/tests/templates/linebreak.xml
@@ -0,0 +1 @@
+<w:br/>
diff --git a/pydocx/tests/templates/numbering.xml b/pydocx/tests/templates/numbering.xml
new file mode 100644
index 00000000..4eaac3cc
--- /dev/null
+++ b/pydocx/tests/templates/numbering.xml
@@ -0,0 +1,23 @@
+<?xml version="1.0"?>
+<w:numbering xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+	{% for num_id, ilvl_data in numbering_dict.items() %}
+	<w:abstractNum w:abstractNumId="{{ num_id }}">
+		{% for ilvl, format in ilvl_data.items() %}
+		<w:lvl w:ilvl="{{ ilvl }}">
+			<w:start w:val="1"/>
+			<w:numFmt w:val="{{ format }}"/>
+			<w:lvlText w:val="%1."/>
+			<w:lvlJc w:val="start"/>
+			<w:pPr>
+				<w:ind w:hanging="0" w:start="0"/>
+			</w:pPr>
+		</w:lvl>
+		{% endfor %}
+	</w:abstractNum>
+	{% endfor %}
+	{% for num_id in numbering_dict %}
+	<w:num w:numId="{{ num_id }}">
+		<w:abstractNumId w:val="{{ num_id }}"/>
+	</w:num>
+	{% endfor %}
+</w:numbering>
diff --git a/pydocx/tests/templates/p.xml b/pydocx/tests/templates/p.xml
new file mode 100644
index 00000000..7a78a060
--- /dev/null
+++ b/pydocx/tests/templates/p.xml
@@ -0,0 +1,19 @@
+<w:p>
+	<w:pPr>
+		<w:pStyle{% if style %} w:val="{{ style }}"{% endif %}/>
+		{% if is_list %}
+		<w:numPr>
+			{% if ilvl != None %}
+			<w:ilvl w:val="{{ ilvl }}"/>
+			{% endif %}
+			{% if numId != None %}
+			<w:numId w:val="{{ numId }}"/>
+			{% endif %}
+		</w:numPr>
+		{% endif %}
+		{% if jc %}<w:jc w:val="{{ jc }}"/>{% endif %}
+	</w:pPr>
+	{% for run_tag in run_tags %}
+		{{ run_tag }}
+	{% endfor %}
+</w:p>
diff --git a/pydocx/tests/templates/pict.xml b/pydocx/tests/templates/pict.xml
new file mode 100644
index 00000000..26f772a3
--- /dev/null
+++ b/pydocx/tests/templates/pict.xml
@@ -0,0 +1,17 @@
+<w:p w:rsidR="00E94BDC" w:rsidRPr="003638EA" w:rsidRDefault="00E94BDC" w:rsidP="00E94BDC">
+    <w:pPr>
+        <w:rPr>
+            <w:color w:val="000000"/>
+        </w:rPr>
+    </w:pPr>
+    <w:r w:rsidR="00360165">
+        <w:rPr>
+            <w:color w:val="000000"/>
+        </w:rPr>
+        <w:pict>
+            <v:shape id="_x0000_i1027" type="#_x0000_t75" {%if width or height %}style="{% if width %}width:{{ width }}pt;{%endif%}{% if height %}height:{{ height }}pt{%endif%}"{% endif %}>
+            {% if r_id %}<v:imagedata r:id="{{ r_id }}" o:title="New Picture"/>{% endif %}
+        </v:shape>
+    </w:pict>
+</w:r>
+        </w:p>
diff --git a/pydocx/tests/templates/r.xml b/pydocx/tests/templates/r.xml
new file mode 100644
index 00000000..2f28a66b
--- /dev/null
+++ b/pydocx/tests/templates/r.xml
@@ -0,0 +1,6 @@
+<w:r>
+	{{ rpr }}
+	{% for element in elements %}
+		{{ element }}
+	{% endfor %}
+</w:r>
diff --git a/pydocx/tests/templates/rpr.xml b/pydocx/tests/templates/rpr.xml
new file mode 100644
index 00000000..f49eb08b
--- /dev/null
+++ b/pydocx/tests/templates/rpr.xml
@@ -0,0 +1,5 @@
+	<w:rPr>
+		{% for tag, value in tags.items() %}
+		<w:{{ tag }} {% if value %} w:val="{{ value }}"{% endif %}/>
+		{% endfor %}
+	</w:rPr>
diff --git a/pydocx/tests/templates/sdt.xml b/pydocx/tests/templates/sdt.xml
new file mode 100644
index 00000000..fe9a7e77
--- /dev/null
+++ b/pydocx/tests/templates/sdt.xml
@@ -0,0 +1,16 @@
+<w:sdt>
+	<w:sdtPr>
+		<w:rPr>
+			<w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman" w:cs="Times New Roman"/>
+			<w:sz w:val="22"/>
+		</w:rPr>
+		<w:alias w:val="PolicyTemplateTitle"/>
+		<w:tag w:val="PolicyTemplateTitle"/>
+		<w:id w:val="95087797"/>
+		<w:lock w:val="sdtLocked"/>
+		<w:text/>
+	</w:sdtPr>
+	<w:sdtContent>
+		{{ p_tag }}
+	</w:sdtContent>
+</w:sdt>
diff --git a/pydocx/tests/templates/sectPr.xml b/pydocx/tests/templates/sectPr.xml
new file mode 100644
index 00000000..16a12050
--- /dev/null
+++ b/pydocx/tests/templates/sectPr.xml
@@ -0,0 +1,3 @@
+<w:sectPr xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+	{{ p_tag }}
+</w:sectPr>
diff --git a/pydocx/tests/templates/smart_tag.xml b/pydocx/tests/templates/smart_tag.xml
new file mode 100644
index 00000000..e45ee5b9
--- /dev/null
+++ b/pydocx/tests/templates/smart_tag.xml
@@ -0,0 +1,5 @@
+<w:smartTag>
+	{% for run_tag in run_tags %}
+		{{ run_tag }}
+	{% endfor %}
+</w:smartTag>
diff --git a/pydocx/tests/templates/style.xml b/pydocx/tests/templates/style.xml
new file mode 100644
index 00000000..5fa9f00f
--- /dev/null
+++ b/pydocx/tests/templates/style.xml
@@ -0,0 +1,15 @@
+<w:style w:styleId="{{ style_id }}">
+	<w:name w:val="{{ value }}"/>
+	<w:basedOn w:val="Normal"/>
+	<w:next w:val="Normal"/>
+	<w:pPr>
+		<w:ind w:hanging="461"/>
+		<w:ind w:left="485"/>
+		<w:spacing w:after="60"/>
+		<w:spacing w:before="61"/>
+	</w:pPr>
+	<w:rPr>
+		<w:sz w:val="24"/>
+		<w:rFonts w:ascii="Times New Roman" w:cs="Times New Roman" w:hAnsi="Times New Roman"/>
+	</w:rPr>
+</w:style>
diff --git a/pydocx/tests/templates/styles.xml b/pydocx/tests/templates/styles.xml
new file mode 100644
index 00000000..a30e752e
--- /dev/null
+++ b/pydocx/tests/templates/styles.xml
@@ -0,0 +1,6 @@
+<?xml version="1.0"?>
+<w:styles xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
+	{% for style in style_tags %}
+		{{ style }}
+	{% endfor %}
+</w:styles>
diff --git a/pydocx/tests/templates/t.xml b/pydocx/tests/templates/t.xml
new file mode 100644
index 00000000..81d562b7
--- /dev/null
+++ b/pydocx/tests/templates/t.xml
@@ -0,0 +1,5 @@
+{% if text %}
+<w:t>{{ text }}</w:t>
+{% else %}
+<w:t />
+{% endif %}
diff --git a/pydocx/tests/templates/table.xml b/pydocx/tests/templates/table.xml
new file mode 100644
index 00000000..e47783b6
--- /dev/null
+++ b/pydocx/tests/templates/table.xml
@@ -0,0 +1,18 @@
+<w:tbl>
+	<w:tblPr>
+		<w:tblW w:type="dxa" w:w="9972"/>
+		<w:jc w:val="left"/>
+		<w:tblBorders>
+			<w:top w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+			<w:left w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+			<w:bottom w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+		</w:tblBorders>
+	</w:tblPr>
+	<w:tblGrid>
+		<w:gridCol w:w="4986"/>
+		<w:gridCol w:w="4986"/>
+	</w:tblGrid>
+	{% for table_row in table_rows %}
+		{{ table_row }}
+	{% endfor %}
+</w:tbl>
diff --git a/pydocx/tests/templates/tc.xml b/pydocx/tests/templates/tc.xml
new file mode 100644
index 00000000..eff9ce0d
--- /dev/null
+++ b/pydocx/tests/templates/tc.xml
@@ -0,0 +1,28 @@
+<w:tc>
+	<w:tcPr>
+		<w:tcW w:type="dxa" w:w="4986"/>
+        {% if merge_continue %}
+        <w:vMerge>
+        </w:vMerge>
+        {% endif %}
+        {% if merge %}
+        <w:vMerge w:val="restart">
+        </w:vMerge>
+        {% endif %}
+		<w:tcBorders>
+			<w:top w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+			<w:left w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+			<w:bottom w:color="000000" w:space="0" w:sz="2" w:val="single"/>
+		</w:tcBorders>
+		<w:shd w:fill="auto" w:val="clear"/>
+		<w:tcMar>
+			<w:top w:type="dxa" w:w="55"/>
+			<w:left w:type="dxa" w:w="55"/>
+			<w:bottom w:type="dxa" w:w="55"/>
+			<w:right w:type="dxa" w:w="55"/>
+		</w:tcMar>
+	</w:tcPr>
+    {% if paragraph %}
+	{{ paragraph }}
+    {% endif %}
+</w:tc>
diff --git a/pydocx/tests/templates/text_delete.xml b/pydocx/tests/templates/text_delete.xml
new file mode 100644
index 00000000..783b3ad3
--- /dev/null
+++ b/pydocx/tests/templates/text_delete.xml
@@ -0,0 +1,10 @@
+<w:del w:id="12" w:author="mfiem" w:date="2008-02-27T06:48:00Z">
+	{% for deleted_text in deleted_texts %}
+	<w:r w:rsidDel="005D3333">
+		<w:rPr>
+			<w:rFonts w:ascii="Times New Roman" w:hAnsi="Times New Roman"/>
+		</w:rPr>
+		<w:delText>{{ deleted_text }}</w:delText>
+	</w:r>
+	{% endfor %}
+</w:del>
diff --git a/pydocx/tests/templates/tr.xml b/pydocx/tests/templates/tr.xml
new file mode 100644
index 00000000..6e2f6925
--- /dev/null
+++ b/pydocx/tests/templates/tr.xml
@@ -0,0 +1,8 @@
+<w:tr>
+	<w:trPr>
+		<w:cantSplit w:val="false"/>
+	</w:trPr>
+	{% for table_cell in table_cells %}
+		{{ table_cell }}
+	{% endfor %}
+</w:tr>
diff --git a/pydocx/tests/test_docx.py b/pydocx/tests/test_docx.py
new file mode 100644
index 00000000..e9d77533
--- /dev/null
+++ b/pydocx/tests/test_docx.py
@@ -0,0 +1,849 @@
+import base64
+from os import path
+from tempfile import NamedTemporaryFile
+
+from nose.plugins.skip import SkipTest
+from nose.tools import raises
+
+from pydocx.tests import assert_html_equal, BASE_HTML
+from pydocx.parsers.Docx2Html import Docx2Html
+from pydocx.DocxParser import ZipFile
+from pydocx.exceptions import MalformedDocxException
+
+
+def convert(path, *args, **kwargs):
+    return Docx2Html(path, *args, **kwargs).parsed
+
+
+def test_extract_html():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'simple.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <p>
+          Simple text
+        </p>
+        <ol list-style-type="decimal">
+          <li>one</li>
+          <li>two</li>
+          <li>three</li>
+        </ol>
+        <table border="1">
+          <tr>
+            <td>Cell1</td>
+            <td>Cell2</td>
+          </tr>
+          <tr>
+            <td>Cell3</td>
+            <td>Cell4</td>
+          </tr>
+        </table>
+    ''')
+
+
+def test_nested_list():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'nested_lists.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <ol list-style-type="decimal">
+            <li>one</li>
+            <li>two</li>
+            <li>three
+                <ol list-style-type="decimal">
+                    <li>AAA</li>
+                    <li>BBB</li>
+                    <li>CCC
+                        <ol list-style-type="decimal">
+                            <li>alpha</li>
+                        </ol>
+                    </li>
+                </ol>
+            </li>
+            <li>four</li>
+        </ol>
+        <ol list-style-type="decimal">
+            <li>xxx
+                <ol list-style-type="decimal">
+                    <li>yyy</li>
+                </ol>
+            </li>
+        </ol>
+        <ul>
+            <li>www
+                <ul>
+                    <li>zzz</li>
+                </ul>
+            </li>
+        </ul>
+    ''')
+
+
+def test_simple_list():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'simple_lists.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <ol list-style-type="decimal">
+            <li>One</li>
+        </ol>
+        <ul>
+            <li>two</li>
+        </ul>
+    ''')
+
+
+def test_inline_tags():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'inline_tags.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % (
+        '<p>This sentence has some <strong>bold</strong>, '
+        'some <em>italics</em> and some '
+        '<span class="pydocx-underline">underline</span>, '
+        'as well as a <a href="http://www.google.com/">hyperlink</a>.</p>'
+    ))
+
+
+def test_all_configured_styles():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'all_configured_styles.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <p><strong>aaa</strong></p>
+        <p><span class="pydocx-underline">bbb</span></p>
+        <p><em>ccc</em></p>
+        <p><span class="pydocx-caps">ddd</span></p>
+        <p><span class="pydocx-small-caps">eee</span></p>
+        <p><span class="pydocx-strike">fff</span></p>
+        <p><span class="pydocx-strike">ggg</span></p>
+        <p><span class="pydocx-hidden">hhh</span></p>
+        <p><span class="pydocx-hidden">iii</span></p>
+    ''')
+
+
+def test_super_and_subscript():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'super_and_subscript.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <p>AAA<sup>BBB</sup></p>
+        <p><sub>CCC</sub>DDD</p>
+    ''')
+
+
+def test_unicode():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'greek_alphabet.docx',
+    )
+    actual_html = convert(file_path)
+    assert actual_html is not None
+    assert u'\u0391\u03b1' in actual_html
+
+
+def test_special_chars():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'special_chars.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <p>&amp; &lt; &gt; <a href="https://www.google.com/?test=1&amp;more=2">link</a></p>''')  # noqa
+
+
+def test_include_tabs():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'include_tabs.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(
+        actual_html,
+        BASE_HTML % '<p>AAA&nbsp;&nbsp;&nbsp;&nbsp;BBB</p>'
+    )
+
+
+def test_table_col_row_span():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'table_col_row_span.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+      <table border="1">
+        <tr>
+          <td colspan="2">AAA</td>
+        </tr>
+        <tr>
+          <td rowspan="2">BBB</td>
+          <td>CCC</td>
+        </tr>
+        <tr>
+          <td>DDD</td>
+        </tr>
+        <tr>
+          <td>
+          <div class='pydocx-right'>EEE
+          </div></td>
+          <td rowspan="2">FFF</td>
+        </tr>
+        <tr>
+          <td>
+           <div class='pydocx-right'>GGG
+           </div></td>
+        </tr>
+      </table>
+      <table border="1">
+        <tr>
+          <td>1</td>
+          <td>2</td>
+          <td>3</td>
+          <td>4</td>
+        </tr>
+        <tr>
+          <td>5</td>
+          <td colspan="2" rowspan="2">6</td>
+          <td>7</td>
+        </tr>
+        <tr>
+          <td>8</td>
+          <td>9</td>
+        </tr>
+        <tr>
+          <td>10</td>
+          <td>11</td>
+          <td>12</td>
+          <td>13</td>
+        </tr>
+      </table>
+    ''')
+
+
+def test_nested_table_rowspan():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'nested_table_rowspan.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <table border="1">
+            <tr>
+                <td colspan="2">AAA</td>
+            </tr>
+            <tr>
+                <td>BBB</td>
+                <td>
+                    <table border="1">
+                        <tr>
+                            <td rowspan="2">CCC</td>
+                            <td>DDD</td>
+                        </tr>
+                        <tr>
+                            <td>EEE</td>
+                        </tr>
+                    </table>
+                </td>
+            </tr>
+        </table>
+    ''')
+
+
+def test_nested_tables():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'nested_tables.docx',
+    )
+    actual_html = convert(file_path)
+    # Find out why br tag is there.
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <table border="1">
+            <tr>
+                <td>AAA</td>
+                <td>BBB</td>
+            </tr>
+            <tr>
+                <td>CCC</td>
+                <td>
+                    <table border="1">
+                        <tr>
+                            <td>DDD</td>
+                            <td>EEE</td>
+                        </tr>
+                        <tr>
+                            <td>FFF</td>
+                            <td>GGG</td>
+                        </tr>
+                    </table>
+                </td>
+            </tr>
+        </table>
+    ''')
+
+
+def test_list_in_table():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'list_in_table.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <table border="1">
+          <tr>
+            <td>
+              <ol list-style-type="decimal">
+                <li>AAA</li>
+                <li>BBB</li>
+                <li>CCC</li>
+              </ol>
+            </td>
+          </tr>
+        </table>
+    ''')
+
+
+def test_tables_in_lists():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'tables_in_lists.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <ol list-style-type="decimal">
+            <li>AAA</li>
+            <li>BBB
+                <table border="1">
+                    <tr>
+                        <td>CCC</td>
+                        <td>DDD</td>
+                    </tr>
+                    <tr>
+                        <td>EEE</td>
+                        <td>FFF</td>
+                    </tr>
+                </table>
+            </li>
+            <li>GGG</li>
+        </ol>
+    ''')
+
+
+def test_track_changes_on():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'track_changes_on.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <p>This was some content.</p>
+    ''')
+
+
+def test_headers():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'headers.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <h1>This is an H1</h1>
+        <h2>This is an H2</h2>
+        <h3>This is an H3</h3>
+        <h4>This is an H4</h4>
+        <h5>This is an H5</h5>
+        <h6>This is an H6</h6>
+        <h6>This is an H7</h6>
+        <h6>This is an H8</h6>
+        <h6>This is an H9</h6>
+        <h6>This is an H10</h6>
+    ''')
+
+
+def test_split_headers():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'split_header.docx',
+    )
+
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <h1>AAA</h1><p>BBB</p><h1>CCC</h1>
+    ''')
+
+
+def get_image_data(docx_file_path, image_name):
+    """
+    Look up image_name under word/media/ inside the docx archive found at
+    docx_file_path and hand back its contents base 64 encoded.
+    """
+    with ZipFile(docx_file_path) as archive:
+        member = 'word/media/%s' % image_name
+        # Fail loudly when the fixture does not contain the image.
+        names = (info.filename for info in archive.infolist())
+        matches = [name for name in names if name == member]
+        if not matches:
+            raise AssertionError('%s not in %s' % (image_name, docx_file_path))
+        raw = archive.read(matches[0])
+    return base64.b64encode(raw)
+
+
+def test_has_image():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'has_image.docx',
+    )
+
+    actual_html = convert(file_path)
+    image_data = get_image_data(file_path, 'image1.gif')
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <p>
+            AAA
+            <img src="data:image/gif;base64,%s" height="55px" width="260px" />
+        </p>
+    ''' % image_data)
+
+
+def test_local_dpi():
+    # The image in this file does not have a set height or width, show that the
+    # html will generate without it.
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'localDpi.docx',
+    )
+    actual_html = convert(file_path)
+    image_data = get_image_data(file_path, 'image1.jpeg')
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <p><img src="data:image/jpeg;base64,%s" /></p>
+    ''' % image_data)
+
+
+def test_has_image_using_image_handler():
+    raise SkipTest('This needs to be converted to an xml test')
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'has_image.docx',
+    )
+
+    def image_handler(*args, **kwargs):
+        return 'test'
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <p>AAA<img src="test" height="55" width="260" /></p>
+    ''')
+
+
+def test_headers_with_full_line_styles():
+    raise SkipTest('This test is not yet passing')
+    # Show that if a natural header is completely bold/italics that
+    # bold/italics will get stripped out.
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'headers_with_full_line_styles.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <h2>AAA</h2>
+        <h2>BBB</h2>
+        <h2><strong>C</strong><em>C</em>C</h2>
+    ''')
+
+
+def test_convert_p_to_h():
+    raise SkipTest('This test is not yet passing')
+    # Show when it is correct to convert a p tag to an h tag based on
+    # bold/italics
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'convert_p_to_h.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <h2>AAA</h2>
+        <h2>BBB</h2>
+        <p>CCC</p>
+        <ol list-style-type="decimal">
+            <li><strong>DDD</strong></li>
+            <li><em>EEE</em></li>
+            <li>FFF</li>
+        </ol>
+        <table border="1">
+            <tr>
+                <td><strong>GGG</strong></td>
+                <td><em>HHH</em></td>
+            </tr>
+            <tr>
+                <td>III</td>
+                <td>JJJ</td>
+            </tr>
+        </table>
+    ''')
+
+
+def test_fake_headings_by_length():
+    raise SkipTest('This test is not yet passing')
+    # Show that converting p tags to h tags has a length limit. If the p tag is
+    # supposed to be converted to an h tag but has more than seven words in the
+    # paragraph do not convert it.
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'fake_headings_by_length.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <h2>Heading.</h2>
+        <h2>Still a heading.</h2>
+        <p>
+        <strong>This is not a heading because it is too many words.</strong>
+        </p>
+    ''')
+
+
+def test_shift_enter():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'shift_enter.docx',
+    )
+
+    # Test just the convert without clean_html to make sure the first
+    # break tag is present.
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <p>AAA<br />BBB</p>
+        <p>CCC</p>
+        <ol list-style-type="decimal">
+            <li>DDD<br />EEE</li>
+            <li>FFF</li>
+        </ol>
+        <table border="1">
+            <tr>
+                <td>GGG<br />HHH</td>
+                <td>III<br />JJJ</td>
+            </tr>
+            <tr>
+                <td>KKK</td>
+                <td>LLL</td>
+            </tr>
+        </table>
+    ''')
+
+
+def test_lists_with_styles():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'lists_with_styles.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <ol list-style-type="decimal">
+            <li>AAA</li>
+            <li>BBB
+                <ol list-style-type="lowerRoman">
+                    <li>CCC</li>
+                    <li>DDD
+                        <ol list-style-type="upperLetter">
+                            <li>EEE
+                                <ol list-style-type="lowerLetter">
+                                    <li>FFF</li>
+                                </ol>
+                            </li>
+                        </ol>
+                    </li>
+                </ol>
+            </li>
+        </ol>
+    ''')
+
+
+def test_list_to_header():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'list_to_header.docx',
+    )
+    actual_html = convert(file_path, convert_root_level_upper_roman=True)
+    # It should be noted that list item `GGG` is upper roman in the word
+    # document to show that only top level upper romans get converted.
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <h2>AAA</h2>
+        <ol list-style-type="decimal">
+            <li>BBB</li>
+        </ol>
+        <h2>CCC</h2>
+        <ol list-style-type="decimal">
+            <li>DDD</li>
+        </ol>
+        <h2>EEE</h2>
+        <ol list-style-type="decimal">
+            <li>FFF
+                <ol list-style-type="upperRoman">
+                    <li>GGG</li>
+                </ol>
+            </li>
+        </ol>
+    ''')
+
+
+def test_has_title():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'has_title.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <p>Title</p>
+        <p><div class='pydocx-left'>Text</div></p>
+    ''')
+
+
+def test_upper_alpha_all_bold():
+    raise SkipTest('This test is not yet passing')
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'upper_alpha_all_bold.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+        <h2>AAA</h2>
+        <h2>BBB</h2>
+        <h2>CCC</h2>
+    ''')
+
+
+def test_simple_table():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'simple_table.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <table border="1">
+        <tr>
+            <td rowspan="2">
+                Cell1<br />
+                Cell3
+            </td>
+            <td>Cell2<br />
+                And I am writing in the table
+            </td>
+        </tr>
+        <tr>
+            <td>Cell4</td>
+        </tr>
+    </table>
+    ''')
+
+
+def test_justification():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'justification.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <p>
+        <div class='pydocx-center'>Center Justified</div>
+    </p>
+    <p>
+        <div class='pydocx-right'>Right justified</div>
+    </p>
+    <p>
+        <div class='pydocx-right' style='margin-right:96.0px;'>
+            Right justified and pushed in from right
+        </div>
+    </p>
+    <p>
+        <div class='pydocx-center'
+                style='margin-left:252.0px;margin-right:96.0px;'>
+            Center justified and pushed in from left and it is
+            great and it is the coolest thing of all time and I like it and
+            I think it is cool
+        </div>
+    </p>
+    <p>
+        <div style='margin-left:252.0px;margin-right:96.0px;'>
+            Left justified and pushed in from left
+        </div>
+    </p>
+    ''')
+
+
+def test_missing_style():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'missing_style.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <p>AAA</p>
+    ''')
+
+
+def test_missing_numbering():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'missing_numbering.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <p>AAA</p>
+    <p>BBB</p>
+    ''')
+
+
+def test_styled_bolding():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'styled_bolding.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <p><strong>AAA</strong></p>
+    <p><strong>BBB</strong></p>
+    ''')
+
+
+def test_no_break_hyphen():
+    file_path = path.join(
+        path.abspath(path.dirname(__file__)),
+        '..',
+        'fixtures',
+        'no_break_hyphen.docx',
+    )
+    actual_html = convert(file_path)
+    assert_html_equal(actual_html, BASE_HTML % '''
+    <p>AAA-BBB</p>
+    ''')
+
+
+@raises(MalformedDocxException)
+def test_malformed_docx_exception():
+    with NamedTemporaryFile(suffix='.docx') as f:
+        convert(f.name)
+
+
+def _converter(*args, **kwargs):
+    # Having a converter that does nothing is the same as if abiword fails to
+    # convert.
+    pass
+
+
+#def test_converter_broken():
+#    file_path = 'test.doc'
+#    assert_raises(
+#        ConversionFailed,
+#        lambda: convert(file_path, converter=_converter),
+#    )
+
+
+def test_fall_back():
+    raise SkipTest('This test is not yet passing')
+    file_path = 'test.doc'
+
+    def fall_back(*args, **kwargs):
+        return 'success'
+    html = convert(file_path, fall_back=fall_back, converter=_converter)
+    assert html == 'success'
+
+
+#@mock.patch('docx2html.core.read_html_file')
+#@mock.patch('docx2html.core.get_zip_file_handler')
+#def test_html_files(patch_zip_handler, patch_read):
+def test_html_files():
+    raise SkipTest('This test is not yet passing')
+
+    def raise_assertion(*args, **kwargs):
+        raise AssertionError('Should not have called get_zip_file_handler')
+    #patch_zip_handler.side_effect = raise_assertion
+
+    def return_text(*args, **kwargs):
+        return 'test'
+    #patch_read.side_effect = return_text
+
+    # Try with an html file
+    file_path = 'test.html'
+
+    html = convert(file_path)
+    assert html == 'test'
+
+    # Try again with an htm file.
+    file_path = 'test.htm'
+
+    html = convert(file_path)
+    assert html == 'test'
diff --git a/pydocx/tests/test_xml.py b/pydocx/tests/test_xml.py
new file mode 100644
index 00000000..904ed2b4
--- /dev/null
+++ b/pydocx/tests/test_xml.py
@@ -0,0 +1,1351 @@
+# -*- coding: utf-8 -*-
+import os
+import time
+
+from nose.plugins.skip import SkipTest
+
+from pydocx.tests.document_builder import DocxBuilder as DXB
+from pydocx.tests import (
+    XMLDocx2Html,
+    _TranslationTestCase,
+)
+from pydocx.utils import parse_xml_from_string, find_all
+
+
+class StyleIsOnTestCase(_TranslationTestCase):
+    expected_output = """
+        <p><strong>AAA</strong></p>
+        <p>BBB</p>
+        <p>CCC</p>
+        <p>DDD</p>
+    """
+
+    def get_xml(self):
+        tags = [
+            DXB.p_tag(
+                [
+                    DXB.r_tag(
+                        [DXB.t_tag('AAA')],
+                        rpr=DXB.rpr_tag({'b': None}),
+                    ),
+                ],
+            ),
+            DXB.p_tag(
+                [
+                    DXB.r_tag(
+                        [DXB.t_tag('BBB')],
+                        rpr=DXB.rpr_tag({'b': 'false'}),
+                    ),
+                ],
+            ),
+            DXB.p_tag(
+                [
+                    DXB.r_tag(
+                        [DXB.t_tag('CCC')],
+                        rpr=DXB.rpr_tag({'b': '0'}),
+                    ),
+                ],
+            ),
+            DXB.p_tag(
+                [
+                    DXB.r_tag(
+                        [DXB.t_tag('DDD')],
+                        rpr=DXB.rpr_tag({'u': 'none'}),
+                    ),
+                ],
+            ),
+        ]
+
+        # Every entry in tags is concatenated onto one string, in order,
+        # to form the body handed to DXB.xml.
+        body = ''.join(tags)
+        xml = DXB.xml(body)
+        return xml
+
+
+class HyperlinkVanillaTestCase(_TranslationTestCase):
+
+    relationship_dict = {
+        'rId0': 'www.google.com',
+    }
+
+    expected_output = '''
+        <p><a href="www.google.com">link</a>.</p>
+    '''
+
+    def get_xml(self):
+        # One hyperlink wrapping a single run, then a run with the period.
+        link_runs = [DXB.r_tag([DXB.t_tag('link')])]
+        hyperlink = DXB.hyperlink_tag(r_id='rId0', run_tags=link_runs)
+        period_run = DXB.r_tag([DXB.t_tag('.')])
+        paragraph = DXB.p_tag([hyperlink, period_run])
+        xml = DXB.xml(paragraph)
+        return xml
+
+
+class HyperlinkWithMultipleRunsTestCase(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'www.google.com',
+    }
+
+    expected_output = '''
+        <p><a href="www.google.com">link</a>.</p>
+    '''
+
+    def get_xml(self):
+        # Each letter of the link text gets a run of its own.
+        letter_runs = [DXB.r_tag([DXB.t_tag(char)]) for char in 'link']
+        hyperlink = DXB.hyperlink_tag(r_id='rId0', run_tags=letter_runs)
+        period_run = DXB.r_tag([DXB.t_tag('.')])
+        paragraph = DXB.p_tag([hyperlink, period_run])
+        return DXB.xml(paragraph)
+
+
+class HyperlinkNoTextTestCase(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'www.google.com',
+    }
+
+    expected_output = ''
+
+    def get_xml(self):
+        # The hyperlink is given no runs, and no output is expected.
+        hyperlink = DXB.hyperlink_tag(r_id='rId0', run_tags=[])
+        paragraph = DXB.p_tag([hyperlink])
+        xml = DXB.xml(paragraph)
+        return xml
+
+
+class HyperlinkNotInRelsDictTestCase(_TranslationTestCase):
+    relationship_dict = {
+        # 'rId0': 'www.google.com', missing
+    }
+
+    expected_output = '<p>link.</p>'
+
+    def get_xml(self):
+        # The hyperlink points at rId0, which relationship_dict lacks.
+        link_runs = [DXB.r_tag([DXB.t_tag('link')])]
+        hyperlink = DXB.hyperlink_tag(r_id='rId0', run_tags=link_runs)
+        period_run = DXB.r_tag([DXB.t_tag('.')])
+        paragraph = DXB.p_tag([hyperlink, period_run])
+        xml = DXB.xml(paragraph)
+        return xml
+
+
+class HyperlinkWithBreakTestCase(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'www.google.com',
+    }
+
+    expected_output = '<p><a href="www.google.com">link<br /></a></p>'
+
+    def get_xml(self):
+        # The line break run sits inside the hyperlink, after the text.
+        link_runs = [
+            DXB.r_tag([DXB.t_tag('link')]),
+            DXB.r_tag([DXB.linebreak()]),
+        ]
+        hyperlink = DXB.hyperlink_tag(r_id='rId0', run_tags=link_runs)
+        return DXB.xml(DXB.p_tag([hyperlink]))
+
+
+class ImageLocal(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'media/image1.jpeg',
+        'rId1': 'media/image2.jpeg',
+    }
+    expected_output = '''
+    <p><img src="word/media/image1.jpeg" /></p>
+    <p><img src="word/media/image2.jpeg" /></p>
+    '''
+
+    def get_xml(self):
+        # Neither image is given a height or width here, and the
+        # expected <img> tags carry no size attributes.
+        image_tags = [
+            DXB.drawing(height=None, width=None, r_id='rId0'),
+            DXB.pict(height=None, width=None, r_id='rId1'),
+        ]
+
+        # The rendered fragments are joined in order to form the body.
+        body = ''.join(image_tags)
+
+        xml = DXB.xml(body)
+        return xml
+
+
+class ImageTestCase(_TranslationTestCase):
+    relationship_dict = {
+        'rId0': 'media/image1.jpeg',
+        'rId1': 'media/image2.jpeg',
+    }
+    expected_output = '''
+        <p>
+            <img src="word/media/image1.jpeg" height="20px" width="40px" />
+        </p>
+        <p>
+            <img src="word/media/image2.jpeg" height="21pt" width="41pt" />
+        </p>
+    '''
+
+    def get_xml(self):
+        # The drawing is sized 20 high by 40 wide and the pict 21 by 41;
+        # expected_output shows them in px and pt respectively.
+        image_tags = [
+            DXB.drawing(height=20, width=40, r_id='rId0'),
+            DXB.pict(height=21, width=41, r_id='rId1'),
+        ]
+
+        # The rendered fragments are joined in order to form the body.
+        body = ''.join(image_tags)
+
+        xml = DXB.xml(body)
+        return xml
+
+    def test_get_image_id(self):
+        # The ids used in get_xml must come back from the image elements.
+        parser = XMLDocx2Html(
+            document_xml=self.get_xml(),
+            rels_dict=self.relationship_dict,
+        )
+        tree = parse_xml_from_string(self.get_xml())
+        image_els = list(find_all(tree, 'drawing'))
+        image_els.extend(find_all(tree, 'pict'))
+        image_ids = [parser._get_image_id(el) for el in image_els]
+        expected = [
+            'rId0',
+            'rId1',
+        ]
+        # Sets are compared, so the order of discovery does not matter
+        # to this check.
+        self.assertEqual(
+            set(image_ids),
+            set(expected),
+        )
+
+    def test_get_image_sizes(self):
+        # Sizes are expected as (width, height) pairs that include units.
+        parser = XMLDocx2Html(
+            document_xml=self.get_xml(),
+            rels_dict=self.relationship_dict,
+        )
+        tree = parse_xml_from_string(self.get_xml())
+        image_els = list(find_all(tree, 'drawing'))
+        image_els.extend(find_all(tree, 'pict'))
+        image_sizes = [parser._get_image_size(el) for el in image_els]
+        expected = [
+            ('40px', '20px'),
+            ('41pt', '21pt'),
+        ]
+        # Sets are compared, so the order of discovery does not matter
+        # to this check.
+        self.assertEqual(
+            set(image_sizes),
+            set(expected),
+        )
+
+
+class ImageNotInRelsDictTestCase(_TranslationTestCase):
+    """A drawing whose r_id has no relationship entry renders nothing."""
+    relationship_dict = {
+        # 'rId0': 'media/image1.jpeg',
+    }
+    expected_output = ''
+
+    def get_xml(self):
+        """Return a body holding a single drawing that points at rId0."""
+        # 'rId0' is deliberately absent from relationship_dict above.
+        orphan_image = DXB.drawing(height=20, width=40, r_id='rId0')
+        return DXB.xml(orphan_image)
+
+
+class ImageNoSizeTestCase(_TranslationTestCase):
+    """
+    An image whose size is (0, 0) should render without size attributes.
+
+    Currently skipped: ``get_xml`` raises ``SkipTest`` before anything is
+    built.
+    """
+    relationship_dict = {
+        'rId0': os.path.join(
+            os.path.abspath(os.path.dirname(__file__)),
+            '..',
+            'fixtures',
+            'bullet_go_gray.png',
+        )
+    }
+    image_sizes = {
+        'rId0': (0, 0),
+    }
+    # The <img> below carries a src only, no height or width.
+    expected_output = '''
+        <html>
+            <p>
+                <img src="%s" />
+            </p>
+        </html>
+    ''' % relationship_dict['rId0']
+
+    # Resolves an image id straight to its entry in the relationship dict.
+    @staticmethod
+    def image_handler(image_id, relationship_dict):
+        return relationship_dict.get(image_id)
+
+    def get_xml(self):
+        raise SkipTest(
+            'Since we are not using PIL, we do not need this test yet.',
+        )
+        # Unreachable while the skip above is in place.
+        return DXB.xml(DXB.drawing('rId0'))
+
+
+class TableTag(_TranslationTestCase):
+    """A plain two-by-two table with one paragraph in every cell."""
+    expected_output = '''
+        <table border="1">
+            <tr>
+                <td>AAA</td>
+                <td>BBB</td>
+            </tr>
+            <tr>
+                <td>CCC</td>
+                <td>DDD</td>
+            </tr>
+        </table>
+    '''
+
+    def get_xml(self):
+        # One inner tuple per row, one paragraph cell per text.
+        rows = [
+            DXB.table_row(
+                [DXB.table_cell(paragraph=DXB.p_tag(text)) for text in texts]
+            )
+            for texts in (('AAA', 'BBB'), ('CCC', 'DDD'))
+        ]
+        return DXB.xml(DXB.table(rows))
+
+
+class RowSpanTestCase(_TranslationTestCase):
+    """A vertically merged cell becomes a single ``<td>`` with a rowspan."""
+
+    expected_output = '''
+           <table border="1">
+            <tr>
+                <td rowspan="2">AAA</td>
+                <td>BBB</td>
+            </tr>
+            <tr>
+                <td>CCC</td>
+            </tr>
+        </table>
+    '''
+
+    def get_xml(self):
+        # The first cell starts the merge; the one below it continues it.
+        merge_start = DXB.table_cell(
+            paragraph=DXB.p_tag('AAA'), merge=True, merge_continue=False)
+        merge_rest = DXB.table_cell(
+            paragraph=DXB.p_tag(None), merge=False, merge_continue=True)
+        first_row = DXB.table_row(
+            [merge_start, DXB.table_cell(paragraph=DXB.p_tag('BBB'))])
+        second_row = DXB.table_row(
+            [merge_rest, DXB.table_cell(paragraph=DXB.p_tag('CCC'))])
+        return DXB.xml(DXB.table([first_row, second_row]))
+
+
+class NestedTableTag(_TranslationTestCase):
+    """A table placed inside the last cell of another table."""
+    expected_output = '''
+        <table border="1">
+            <tr>
+                <td>AAA</td>
+                <td>BBB</td>
+            </tr>
+            <tr>
+                <td>CCC</td>
+                <td>
+                    <table border="1">
+                        <tr>
+                            <td>DDD</td>
+                            <td>EEE</td>
+                        </tr>
+                        <tr>
+                            <td>FFF</td>
+                            <td>GGG</td>
+                        </tr>
+                    </table>
+                </td>
+            </tr>
+        </table>
+    '''
+
+    def get_xml(self):
+        def text_cell(text):
+            return DXB.table_cell(paragraph=DXB.p_tag(text))
+
+        inner_rows = [
+            DXB.table_row([text_cell('DDD'), text_cell('EEE')]),
+            DXB.table_row([text_cell('FFF'), text_cell('GGG')]),
+        ]
+        # The inner table is the whole content of the last outer cell.
+        nested_cell = DXB.table_cell(DXB.table(inner_rows))
+        outer_rows = [
+            DXB.table_row([text_cell('AAA'), text_cell('BBB')]),
+            DXB.table_row([text_cell('CCC'), nested_cell]),
+        ]
+        return DXB.xml(DXB.table(outer_rows))
+
+
+class TableWithInvalidTag(_TranslationTestCase):
+    """Content wrapped in an unknown tag is dropped; its cell stays empty."""
+    expected_output = '''
+        <table border="1">
+            <tr>
+                <td>AAA</td>
+                <td>BBB</td>
+            </tr>
+            <tr>
+                <td></td>
+                <td>DDD</td>
+            </tr>
+        </table>
+    '''
+
+    def get_xml(self):
+        invalid_cell = DXB.table_cell('<w:invalidTag>CCC</w:invalidTag>')
+        first_row = DXB.table_row([
+            DXB.table_cell(paragraph=DXB.p_tag('AAA')),
+            DXB.table_cell(paragraph=DXB.p_tag('BBB')),
+        ])
+        second_row = DXB.table_row(
+            [invalid_cell, DXB.table_cell(paragraph=DXB.p_tag('DDD'))])
+        return DXB.xml(DXB.table([first_row, second_row]))
+
+
+class TableWithListAndParagraph(_TranslationTestCase):
+    """
+    A single cell that holds a numbered list followed by two paragraphs;
+    the paragraphs end up separated by a line break.
+    """
+    expected_output = '''
+        <table border="1">
+            <tr>
+                <td>
+                    <ol list-style-type="decimal">
+                        <li>AAA</li>
+                        <li>BBB</li>
+                    </ol>
+                    CCC<br />
+                    DDD
+                </td>
+            </tr>
+        </table>
+    '''
+
+    def get_xml(self):
+        """Return a one-cell table holding a list and two paragraphs."""
+        li_text = [
+            ('AAA', 0, 1),
+            ('BBB', 0, 1),
+        ]
+        list_items = ''.join(
+            DXB.li(text=text, ilvl=ilvl, numId=numId)
+            for text, ilvl, numId in li_text
+        )
+        cell_parts = [
+            list_items,
+            DXB.p_tag('CCC'),
+            DXB.p_tag('DDD'),
+        ]
+        only_cell = DXB.table_cell(''.join(cell_parts))
+        only_row = DXB.table_row([only_cell])
+        return DXB.xml(DXB.table([only_row]))
+
+
+class SimpleListTestCase(_TranslationTestCase):
+    """Three items of one list at the same level form a single ``<ol>``."""
+    expected_output = '''
+        <ol list-style-type="lowerLetter">
+            <li>AAA</li>
+            <li>BBB</li>
+            <li>CCC</li>
+        </ol>
+    '''
+
+    # A non-decimal style shows that nothing silently fell back to decimal.
+    numbering_dict = {
+        '1': {
+            '0': 'lowerLetter',
+        }
+    }
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 1),
+            ('BBB', 0, 1),
+            ('CCC', 0, 1),
+        ]
+        list_items = [
+            DXB.li(text=text, ilvl=ilvl, numId=numId)
+            for text, ilvl, numId in li_text
+        ]
+        return DXB.xml(''.join(list_items))
+
+
+class SingleListItemTestCase(_TranslationTestCase):
+    """A document that consists of one list item only."""
+    expected_output = '''
+        <ol list-style-type="lowerLetter">
+            <li>AAA</li>
+        </ol>
+    '''
+
+    # A non-decimal style shows that nothing silently fell back to decimal.
+    numbering_dict = {
+        '1': {
+            '0': 'lowerLetter',
+        }
+    }
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 1),
+        ]
+        list_items = [
+            DXB.li(text=text, ilvl=ilvl, numId=numId)
+            for text, ilvl, numId in li_text
+        ]
+        return DXB.xml(''.join(list_items))
+
+
+class ListWithContinuationTestCase(_TranslationTestCase):
+    """Non-list content between items is folded into the preceding item."""
+    expected_output = '''
+        <ol list-style-type="decimal">
+            <li>AAA<br />BBB</li>
+            <li>CCC
+                <table border="1">
+                    <tr>
+                        <td>DDD</td>
+                        <td>EEE</td>
+                    </tr>
+                    <tr>
+                        <td>FFF</td>
+                        <td>GGG</td>
+                    </tr>
+                </table>
+            </li>
+            <li>HHH</li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        def text_cell(text):
+            return DXB.table_cell(paragraph=DXB.p_tag(text))
+
+        table = DXB.table([
+            DXB.table_row([text_cell('DDD'), text_cell('EEE')]),
+            DXB.table_row([text_cell('FFF'), text_cell('GGG')]),
+        ])
+
+        # The paragraph and the table sit between list items, so each one
+        # continues the item right before it.
+        tags = [
+            DXB.li(text='AAA', ilvl=0, numId=1),
+            DXB.p_tag('BBB'),
+            DXB.li(text='CCC', ilvl=0, numId=1),
+            table,
+            DXB.li(text='HHH', ilvl=0, numId=1),
+        ]
+        return DXB.xml(''.join(tags))
+
+
+class ListWithMultipleContinuationTestCase(_TranslationTestCase):
+    """
+    Several consecutive non-list elements (two tables here) are all folded
+    into the list item that precedes them.
+    """
+    expected_output = '''
+        <ol list-style-type="decimal">
+            <li>AAA
+                <table border="1">
+                    <tr>
+                        <td>BBB</td>
+                    </tr>
+                </table>
+                <table border="1">
+                    <tr>
+                        <td>CCC</td>
+                    </tr>
+                </table>
+            </li>
+            <li>DDD</li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        """Return a list whose first item is followed by two tables."""
+        def single_cell_table(text):
+            # One row, one cell, one paragraph.
+            cell = DXB.table_cell(paragraph=DXB.p_tag(text))
+            return DXB.table([DXB.table_row([cell])])
+
+        # Two tables in a row, both between the same pair of list items.
+        tags = [
+            DXB.li(text='AAA', ilvl=0, numId=1),
+            single_cell_table('BBB'),
+            single_cell_table('CCC'),
+            DXB.li(text='DDD', ilvl=0, numId=1),
+        ]
+        return DXB.xml(''.join(tags))
+
+
+class MangledIlvlTestCase(_TranslationTestCase):
+    """A list that opens at ilvl 1 and then drops back to ilvl 0."""
+    expected_output = '''
+        <ol list-style-type="lowerLetter">
+            <li>AAA</li>
+        </ol>
+        <ol list-style-type="decimal">
+            <li>BBB
+                <ol list-style-type="decimal">
+                    <li>CCC</li>
+                </ol>
+            </li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 2),
+            ('BBB', 1, 1),
+            ('CCC', 0, 1),
+        ]
+        list_items = [
+            DXB.li(text=text, ilvl=ilvl, numId=numId)
+            for text, ilvl, numId in li_text
+        ]
+        return DXB.xml(''.join(list_items))
+
+
+class SeperateListsTestCase(_TranslationTestCase):
+    """A list interrupted by an item of another list is split in three."""
+    expected_output = '''
+        <ol list-style-type="lowerLetter">
+            <li>AAA</li>
+        </ol>
+        <ol list-style-type="decimal">
+            <li>BBB</li>
+        </ol>
+        <ol list-style-type="lowerLetter">
+            <li>CCC</li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 2),
+            # AAA and CCC share a list id while BBB, in between, has a
+            # different one, so three separate lists have to be produced
+            # (or everything from BBB onwards would be lost).
+            ('BBB', 0, 1),
+            ('CCC', 0, 2),
+        ]
+        list_items = [
+            DXB.li(text=text, ilvl=ilvl, numId=numId)
+            for text, ilvl, numId in li_text
+        ]
+        return DXB.xml(''.join(list_items))
+
+
+class InvalidIlvlOrderTestCase(_TranslationTestCase):
+    """
+    Items whose ilvl values skip and go backwards (1, 3, 2) still come
+    out as three lists nested one inside the other.
+    """
+    expected_output = '''
+        <ol list-style-type="decimal">
+            <li>AAA
+                <ol list-style-type="decimal">
+                    <li>BBB
+                        <ol list-style-type="decimal">
+                            <li>CCC</li>
+                        </ol>
+                    </li>
+                </ol>
+            </li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        # The levels start at 1 and are neither contiguous nor ordered.
+        tags = [
+            DXB.li(text='AAA', ilvl=1, numId=1),
+            DXB.li(text='BBB', ilvl=3, numId=1),
+            DXB.li(text='CCC', ilvl=2, numId=1),
+        ]
+        return DXB.xml(''.join(tags))
+
+
+class DeeplyNestedTableTestCase(_TranslationTestCase):
+    """Timing check: converting the generated table must stay fast."""
+    expected_output = ''
+    run_expected_output = False
+
+    def get_xml(self):
+        paragraph = DXB.p_tag('AAA')
+
+        # NOTE(review): each pass rebuilds the same one-cell table, so
+        # nothing ends up nested; confirm the intended depth.
+        for _ in range(1000):
+            cell = DXB.table_cell(paragraph)
+            row = DXB.table_row([cell])
+            table = DXB.table([row])
+        return DXB.xml(table)
+
+    def test_performance(self):
+        with self.toggle_run_expected_output():
+            start_time = time.time()
+            try:
+                self.test_expected_output()
+            except AssertionError:
+                pass  # only the elapsed time matters here
+            total_time = time.time() - start_time
+            # This finishes in under a second on python 2.7
+            assert total_time < 3, total_time
+
+
+class LargeCellTestCase(_TranslationTestCase):
+    """Timing check for a single table cell built from 1000 paragraphs."""
+    expected_output = ''
+    run_expected_output = False
+
+    def get_xml(self):
+        # As many paragraphs as the recursion limit (1000).
+        # NOTE(review): the cell is handed a list here while other tests
+        # pass a string -- confirm the builder accepts both.
+        paragraphs = [DXB.p_tag('%d' % i) for i in range(1000)]
+        cell = DXB.table_cell(paragraphs)
+        row = DXB.table_row([cell])
+        table = DXB.table([row])
+        return DXB.xml(table)
+
+    def test_performance(self):
+        with self.toggle_run_expected_output():
+            start_time = time.time()
+            try:
+                self.test_expected_output()
+            except AssertionError:
+                pass  # only the elapsed time matters here
+            total_time = time.time() - start_time
+            # This finishes in under a second on python 2.7
+            assert total_time < 3, total_time
+
+
+class NonStandardTextTagsTestCase(_TranslationTestCase):
+    """Text inside insert and smart tags is kept; inserts get a span."""
+    expected_output = '''
+        <p><span class='pydocx-insert'>insert </span>
+        smarttag</p>
+    '''
+
+    def get_xml(self):
+        # Every letter gets a run of its own.
+        def one_run_per_letter(text):
+            return [DXB.r_tag([DXB.t_tag(letter)]) for letter in text]
+
+        insert_tag = DXB.insert_tag(one_run_per_letter('insert '))
+        smart_tag = DXB.smart_tag(one_run_per_letter('smarttag'))
+        paragraph = DXB.p_tag([insert_tag, smart_tag])
+        return DXB.xml(paragraph)
+
+
+class RTagWithNoText(_TranslationTestCase):
+    """A hyperlink that wraps a text-less paragraph renders nothing."""
+    expected_output = ''
+
+    def get_xml(self):
+        empty_paragraph = DXB.p_tag(None)  # No text
+        # The bug is only present in a hyperlink
+        hyperlink = DXB.hyperlink_tag(
+            r_id='rId0',
+            run_tags=[empty_paragraph],
+        )
+        return DXB.xml(DXB.p_tag([hyperlink]))
+
+
+class DeleteTagInList(_TranslationTestCase):
+    """Deleted text after a list item joins that item inside a span."""
+    expected_output = '''
+        <ol list-style-type="decimal">
+            <li>AAA
+                <span class='pydocx-delete'>BBB</span>
+            </li>
+            <li>CCC</li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        # The deleted text sits in a paragraph between the two list items.
+        deleted_paragraph = DXB.p_tag([DXB.delete_tag(['BBB'])])
+        tags = [
+            DXB.li(text='AAA', ilvl=0, numId=0),
+            deleted_paragraph,
+            DXB.li(text='CCC', ilvl=0, numId=0),
+        ]
+        return DXB.xml(''.join(tags))
+
+
+class InsertTagInList(_TranslationTestCase):
+    """Inserted text after a list item joins that item inside a span."""
+    expected_output = '''
+        <ol list-style-type="decimal">
+            <li>AAA<span class='pydocx-insert'>BBB</span>
+            </li>
+            <li>CCC</li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        # One run per letter, all wrapped in a single insert tag.
+        run_tags = [DXB.r_tag([DXB.t_tag(letter)]) for letter in 'BBB']
+        inserted_paragraph = DXB.p_tag([DXB.insert_tag(run_tags)])
+        tags = [
+            DXB.li(text='AAA', ilvl=0, numId=0),
+            inserted_paragraph,
+            DXB.li(text='CCC', ilvl=0, numId=0),
+        ]
+        return DXB.xml(''.join(tags))
+
+
+class SmartTagInList(_TranslationTestCase):
+    """Smart-tag text after a list item is appended to that item as is."""
+    expected_output = '''
+        <ol list-style-type="decimal">
+            <li>AAABBB
+            </li>
+            <li>CCC</li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        # One run per letter, all wrapped in a single smart tag.
+        run_tags = [DXB.r_tag([DXB.t_tag(letter)]) for letter in 'BBB']
+        smart_paragraph = DXB.p_tag([DXB.smart_tag(run_tags)])
+        tags = [
+            DXB.li(text='AAA', ilvl=0, numId=0),
+            smart_paragraph,
+            DXB.li(text='CCC', ilvl=0, numId=0),
+        ]
+        return DXB.xml(''.join(tags))
+
+
+class SingleListItem(_TranslationTestCase):
+    """A paragraph after the only list item stays outside of the list."""
+    expected_output = '''
+        <ol list-style-type="lowerLetter">
+            <li>AAA</li>
+        </ol>
+        <p>BBB</p>
+    '''
+
+    # numId '1', ilvl '0' -> lowerLetter
+    numbering_dict = {
+        '1': {
+            '0': 'lowerLetter',
+        }
+    }
+
+    def get_xml(self):
+        """Return one list item followed by one plain paragraph."""
+        list_item = DXB.li(text='AAA', ilvl=0, numId=1)
+        trailing_paragraphs = [
+            DXB.p_tag('BBB'),
+        ]
+        body = list_item + ''.join(trailing_paragraphs)
+        return DXB.xml(body)
+
+
+class SimpleTableTest(_TranslationTestCase):
+    """
+    A three-by-three table with a header row and a header column, all of
+    them ordinary cells.
+    """
+    expected_output = '''
+        <table border="1">
+            <tr>
+                <td>Blank</td>
+                <td>Column 1</td>
+                <td>Column 2</td>
+            </tr>
+            <tr>
+                <td>Row 1</td>
+                <td>First</td>
+                <td>Second</td>
+            </tr>
+            <tr>
+                <td>Row 2</td>
+                <td>Third</td>
+                <td>Fourth</td>
+            </tr>
+        </table>'''
+
+    def get_xml(self):
+        table_text = (
+            ('Blank', 'Column 1', 'Column 2'),
+            ('Row 1', 'First', 'Second'),
+            ('Row 2', 'Third', 'Fourth'),
+        )
+        rows = [
+            DXB.table_row(
+                [DXB.table_cell(paragraph=DXB.p_tag(text)) for text in texts]
+            )
+            for texts in table_text
+        ]
+        return DXB.xml(DXB.table(rows))
+
+
+class MissingIlvl(_TranslationTestCase):
+    """A list item without an ilvl is folded into the item before it."""
+    expected_output = '''
+        <ol list-style-type="decimal">
+            <li>AAA<br />
+                BBB
+            </li>
+            <li>CCC</li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 1),
+            ('BBB', None, 1),  # No ilvl at all.
+            ('CCC', 0, 1),
+        ]
+        list_items = [
+            DXB.li(text=text, ilvl=ilvl, numId=numId)
+            for text, ilvl, numId in li_text
+        ]
+        return DXB.xml(''.join(list_items))
+
+
+class SameNumIdInTable(_TranslationTestCase):
+    """
+    A list item inside a table cell that shares its numId with the list
+    around the table gets a list of its own within the cell.
+    """
+    expected_output = '''
+        <ol list-style-type="lowerLetter">
+            <li>AAA
+                <table border="1">
+                    <tr>
+                        <td>
+                            <ol list-style-type="lowerLetter">
+                                <li>BBB</li>
+                            </ol>
+                        </td>
+                    </tr>
+                </table>
+            </li>
+            <li>CCC</li>
+        </ol>
+    '''
+
+    # A non-decimal style shows that nothing silently fell back to decimal.
+    numbering_dict = {
+        '1': {
+            '0': 'lowerLetter',
+        }
+    }
+
+    def get_xml(self):
+        """Return two list items with a one-cell table between them."""
+        # The item inside the table reuses the numId of the items around it.
+        inner_item = DXB.li(text='BBB', ilvl=0, numId=1)
+        cell = DXB.table_cell(inner_item)
+        row = DXB.table_row([cell])
+        table = DXB.table([row])
+        tags = [
+            DXB.li(text='AAA', ilvl=0, numId=1),
+            table,
+            DXB.li(text='CCC', ilvl=0, numId=1),
+        ]
+        return DXB.xml(''.join(tags))
+
+
+class SDTTestCase(_TranslationTestCase):
+    """An sdt paragraph after a list item is appended to that item."""
+    expected_output = '''
+        <ol list-style-type="decimal">
+            <li>AAABBB
+            </li>
+            <li>CCC</li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        tags = [
+            DXB.li(text='AAA', ilvl=0, numId=0),
+            DXB.sdt_tag(p_tag=DXB.p_tag(text='BBB')),
+            DXB.li(text='CCC', ilvl=0, numId=0),
+        ]
+        return DXB.xml(''.join(tags))
+
+
+class HeadingTestCase(_TranslationTestCase):
+    """Styles named 'heading N' give ``<hN>``; unknown styles give ``<p>``."""
+    expected_output = '''
+        <h1>AAA</h1>
+        <h2>BBB</h2>
+        <h3>CCC</h3>
+        <h4>DDD</h4>
+        <h5>EEE</h5>
+        <h6>GGG</h6>
+        <p>HHH</p>
+    '''
+
+    styles_dict = {
+        'style0': {
+            'style_name': 'heading 1',
+        },
+        'style1': {
+            'style_name': 'heading 2',
+        },
+        'style2': {
+            'style_name': 'heading 3',
+        },
+        'style3': {
+            'style_name': 'heading 4',
+        },
+        'style4': {
+            'style_name': 'heading 5',
+        },
+        'style5': {
+            'style_name': 'heading 6',
+        },
+    }
+
+    def get_xml(self):
+        # (text, style id); 'garbage' is not a key of styles_dict.
+        styled_text = [
+            ('AAA', 'style0'),
+            ('BBB', 'style1'),
+            ('CCC', 'style2'),
+            ('DDD', 'style3'),
+            ('EEE', 'style4'),
+            ('GGG', 'style5'),
+            ('HHH', 'garbage'),
+        ]
+        p_tags = [
+            DXB.p_tag(text=text, style=style) for text, style in styled_text
+        ]
+        return DXB.xml(''.join(p_tags))
+
+
+class StyledBoldingTestCase(_TranslationTestCase):
+    """
+    Bold coming from a paragraph style.
+
+    ``style0`` makes its runs bold by default.  A run that asks for bold
+    again is not wrapped twice, and a run that sets bold to ``false``
+    overrides the style.
+    """
+    expected_output = '''
+        <p><strong>AAA</strong></p>
+        <p><strong>BBB</strong></p>
+        <p>CCC</p>
+    '''
+
+    # The only style: its default run properties switch bold on.
+    styles_dict = {
+        'style0': {
+            'style_name': 'p1',
+            'default_run_properties': {
+                'b': '',
+            }
+        },
+    }
+
+    def get_xml(self):
+        """Return three ``style0`` paragraphs with differing run props."""
+        bold_run = DXB.r_tag(
+            [DXB.t_tag('BBB')],
+            # Bold again on top of the style default: the output must
+            # not contain duplicates.
+            rpr=DXB.rpr_tag({'b': None}),
+        )
+
+        plain_run = DXB.r_tag(
+            [DXB.t_tag('CCC')],
+            # An explicit 'false' overwrites the bold that comes from
+            # the current style.
+            rpr=DXB.rpr_tag({'b': 'false'}),
+        )
+
+        p_tags = [
+            DXB.p_tag(text='AAA', style='style0'),
+            DXB.p_tag([bold_run], style='style0'),
+            DXB.p_tag([plain_run], style='style0'),
+        ]
+        return DXB.xml(''.join(p_tags))
+
+
+class RomanNumeralToHeadingTestCase(_TranslationTestCase):
+    """With the flag set, root-level upperRoman items become ``<h2>``."""
+    convert_root_level_upper_roman = True
+    numbering_dict = {
+        '1': {
+            '0': 'upperRoman',
+            '1': 'decimal',
+            '2': 'upperRoman',
+        },
+        '2': {
+            '0': 'upperRoman',
+            '1': 'decimal',
+            '2': 'upperRoman',
+        },
+        '3': {
+            '0': 'upperRoman',
+            '1': 'decimal',
+            '2': 'upperRoman',
+        },
+    }
+    expected_output = '''
+        <h2>AAA</h2>
+        <ol list-style-type="decimal">
+            <li>BBB</li>
+        </ol>
+        <h2>CCC</h2>
+        <ol list-style-type="decimal">
+            <li>DDD</li>
+        </ol>
+        <h2>EEE</h2>
+        <ol list-style-type="decimal">
+            <li>FFF
+                <ol list-style-type="upperRoman">
+                    <li>GGG</li>
+                </ol>
+            </li>
+        </ol>
+    '''
+
+    def get_xml(self):
+        li_text = [
+            ('AAA', 0, 1),
+            ('BBB', 1, 1),
+            ('CCC', 0, 2),
+            ('DDD', 1, 2),
+            ('EEE', 0, 3),
+            ('FFF', 1, 3),
+            ('GGG', 2, 3),
+        ]
+        list_items = [
+            DXB.li(text=text, ilvl=ilvl, numId=numId)
+            for text, ilvl, numId in li_text
+        ]
+        return DXB.xml(''.join(list_items))
+
+
+class MultipleTTagsInRTag(_TranslationTestCase):
+    """
+    Several text tags inside a single run are concatenated into one piece
+    of text.
+    """
+    expected_output = '''
+        <p>ABC</p>
+    '''
+
+    def get_xml(self):
+        # One run that carries a separate text tag for every letter.
+        text_tags = [DXB.t_tag(letter) for letter in 'ABC']
+        paragraph = DXB.p_tag(
+            [DXB.r_tag(text_tags)],
+            jc='start',
+        )
+        return DXB.xml(paragraph)
+
+
+class SuperAndSubScripts(_TranslationTestCase):
+    """
+    ``vertAlign`` run properties: ``superscript`` renders as ``<sup>`` and
+    ``subscript`` as ``<sub>``, next to runs that carry no properties.
+    """
+    expected_output = '''
+        <p>AAA<sup>BBB</sup></p>
+        <p><sub>CCC</sub>DDD</p>
+    '''
+
+    def get_xml(self):
+        def run(text, vert_align=None):
+            # Runs without an alignment get no run properties at all.
+            if vert_align is None:
+                return DXB.r_tag([DXB.t_tag(text)])
+            rpr = DXB.rpr_tag({'vertAlign': vert_align})
+            return DXB.r_tag([DXB.t_tag(text)], rpr=rpr)
+
+        p_tags = [
+            DXB.p_tag(
+                [
+                    run('AAA'),
+                    run('BBB', 'superscript'),
+                ],
+            ),
+            DXB.p_tag(
+                [
+                    run('CCC', 'subscript'),
+                    run('DDD'),
+                ],
+            ),
+        ]
+        return DXB.xml(''.join(p_tags))
+
+
+class AvaliableInlineTags(_TranslationTestCase):
+    """
+    Run properties that map to inline markup, one property per paragraph.
+
+    Property to expected output, in paragraph order:
+
+    * ``b`` -> ``<strong>``
+    * ``u`` -> underline span
+    * ``i`` -> ``<em>``
+    * ``caps`` -> caps span
+    * ``smallCaps`` -> small-caps span
+    * ``strike`` and ``dstrike`` -> strike span
+    * ``vanish`` and ``webHidden`` -> hidden span
+    * ``vertAlign`` of ``superscript`` -> ``<sup>``
+    """
+    expected_output = '''
+        <p><strong>aaa</strong></p>
+        <p><span class="pydocx-underline">bbb</span></p>
+        <p><em>ccc</em></p>
+        <p><span class="pydocx-caps">ddd</span></p>
+        <p><span class="pydocx-small-caps">eee</span></p>
+        <p><span class="pydocx-strike">fff</span></p>
+        <p><span class="pydocx-strike">ggg</span></p>
+        <p><span class="pydocx-hidden">hhh</span></p>
+        <p><span class="pydocx-hidden">iii</span></p>
+        <p><sup>jjj</sup></p>
+    '''
+
+    def get_xml(self):
+        """Return ten paragraphs, each a single run with one property."""
+        p_tags = [
+            # Bold.
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('aaa')],
+                    rpr=DXB.rpr_tag({'b': None}),
+                ),
+            ]),
+            # Underline.
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('bbb')],
+                    rpr=DXB.rpr_tag({'u': None}),
+                ),
+            ]),
+            # Italic.
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('ccc')],
+                    rpr=DXB.rpr_tag({'i': None}),
+                ),
+            ]),
+            # Caps.
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('ddd')],
+                    rpr=DXB.rpr_tag({'caps': None}),
+                ),
+            ]),
+            # Small caps.
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('eee')],
+                    rpr=DXB.rpr_tag({'smallCaps': None}),
+                ),
+            ]),
+            # Strike.
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('fff')],
+                    rpr=DXB.rpr_tag({'strike': None}),
+                ),
+            ]),
+            # Double strike, rendered like a plain strike.
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('ggg')],
+                    rpr=DXB.rpr_tag({'dstrike': None}),
+                ),
+            ]),
+            # Vanish (hidden).
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('hhh')],
+                    rpr=DXB.rpr_tag({'vanish': None}),
+                ),
+            ]),
+            # Web hidden, rendered like vanish.
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('iii')],
+                    rpr=DXB.rpr_tag({'webHidden': None}),
+                ),
+            ]),
+            # Superscript.
+            DXB.p_tag([
+                DXB.r_tag(
+                    [DXB.t_tag('jjj')],
+                    rpr=DXB.rpr_tag({'vertAlign': 'superscript'}),
+                ),
+            ]),
+        ]
+        return DXB.xml(''.join(p_tags))
+
+
+class UnicodeTestCase(_TranslationTestCase):
+    """
+    A numeric character reference outside the Basic Multilingual Plane
+    comes out as the character it stands for.
+    """
+    expected_output = u"""
+        <p>\U0010001f</p>
+    """
+
+    def get_xml(self):
+        # Numeric character reference for U+10001F.
+        text_tag = DXB.t_tag(r'&#x10001F;')
+        paragraph = DXB.p_tag(
+            [
+                DXB.r_tag([text_tag]),
+            ],
+        )
+
+        xml = DXB.xml(paragraph)
+        # The document is handed over as UTF-8 encoded bytes.
+        return xml.encode('utf-8')
+
+
+class NoTextInTTagTestCase(_TranslationTestCase):
+    """
+    A run whose text tag carries no text at all is expected to produce
+    no output.
+    """
+    expected_output = u"""
+    """
+
+    def get_xml(self):
+        """Return one paragraph whose only run has an empty text tag."""
+        empty_text_tag = DXB.t_tag(None)
+        paragraph = DXB.p_tag(
+            [
+                DXB.r_tag([empty_text_tag]),
+            ],
+        )
+
+        xml = DXB.xml(paragraph)
+        # The document is handed over as UTF-8 encoded bytes.
+        return xml.encode('utf-8')
diff --git a/pydocx/utils.py b/pydocx/utils.py
new file mode 100644
index 00000000..1323302b
--- /dev/null
+++ b/pydocx/utils.py
@@ -0,0 +1,495 @@
+import re
+import collections
+
+from collections import defaultdict
+from xml.etree import cElementTree
+
+from pydocx.exceptions import MalformedDocxException
+
+
+UPPER_ROMAN_TO_HEADING_VALUE = 'h2'  # heading_level for upperRoman items
+TAGS_CONTAINING_CONTENT = (  # a descendant with one of these is "content"
+    't',
+    'pict',
+    'drawing',
+    'delText',
+    'ins',
+)
+TAGS_HOLDING_CONTENT_TAGS = (  # direct children considered by `_set_next`
+    'p',
+    'tbl',
+    'sdt',
+)
+
+
+class MulitMemoize(object):
+    '''
+    Per-function memoizer; adapted from:
+    https://wiki.python.org/moin/PythonDecoratorLibrary#Memoize
+    `func_names` maps a name to the callable to cache, for example
+    ``{'find_all': find_all}``. Results are keyed on the positional
+    arguments, separately for every name.
+    '''
+    def __init__(self, func_names):
+        self.cache = dict((func_name, {}) for func_name in func_names)
+        self.func_names = func_names
+
+    def __call__(self, func_name, *args):
+        func = self.func_names[func_name]
+        # `args` itself is a tuple and always passes a Hashable check, so
+        # every argument is tested; a list, for instance, is uncacheable.
+        if not all(isinstance(arg, collections.Hashable) for arg in args):
+            # better to not cache than blow up.
+            return func(*args)
+        cache = self.cache[func_name]
+        if args not in cache:
+            cache[args] = func(*args)
+        return cache[args]
+
+
+class MulitMemoizeMixin(object):  # gives a class its own MulitMemoize
+    def __init__(self, *args, **kwargs):
+        super(MulitMemoizeMixin, self).__init__(*args, **kwargs)
+        self._memoization = None  # set by populate_memoization()
+
+    def memod_tree_op(self, func_name, *args):  # memoized `func_name` call
+        return self._memoization(func_name, *args)
+
+    def populate_memoization(self, func_names):  # fresh, empty caches
+        self._memoization = MulitMemoize(func_names)
+
+
+def el_iter(el):
+    """
+    Go through all elements: `el.iter()` where the element provides it.
+    """
+    try:
+        return el.iter()
+    except AttributeError:
+        return el.findall('.//*')  # NOTE(review): omits `el` itself
+
+
+def find_first(el, tag):
+    """
+    Return the first descendant of `el` named `tag`, or None if there is none.
+    """
+    return el.find('.//' + tag)
+
+
+def find_all(el, tag):
+    """
+    Return every descendant of `el` named `tag` (via `el.findall`).
+    """
+    return el.findall('.//' + tag)
+
+
+def find_ancestor_with_tag(pre_processor, el, tag):
+    """Return the nearest ancestor of `el` whose tag is `tag`, else None."""
+    # Walk upwards using the parent links recorded by the pre-processor.
+    ancestor = pre_processor.parent(el)
+    while ancestor is not None:
+        if ancestor.tag == tag:
+            return ancestor
+        ancestor = pre_processor.parent(ancestor)
+    return None
+
+
+def has_descendant_with_tag(el, tag):
+    """
+    Tell whether any element named `tag` exists somewhere beneath `el`.
+    """
+    # Only the first match is looked up; one hit is enough to answer.
+    return find_first(el, tag) is not None
+
+
+def _filter_children(element, tags):
+    return [  # direct children of `element` only, filtered by tag
+        el for el in element.getchildren()
+        if el.tag in tags
+    ]
+
+
+def remove_namespaces(document):
+    """
+    Strip the `{namespace}` prefix from tag and attribute names in the xml
+    string `document` and return the re-serialized xml. Raises
+    MalformedDocxException when `document` cannot be parsed.
+
+    >>> exception_raised = False
+    >>> try:
+    ...     remove_namespaces('junk')
+    ... except MalformedDocxException:
+    ...     exception_raised = True
+    >>> assert exception_raised
+    """
+    # Only look inside the xml declaration itself for the encoding.
+    m = re.match(r'<\?xml[^>]*?encoding="(.+?)"', document)
+    encoding = m.group(1) if m else 'us-ascii'
+    try:
+        root = cElementTree.fromstring(document)
+    except SyntaxError:
+        raise MalformedDocxException('This document cannot be converted.')
+    for child in el_iter(root):
+        # [-1] also copes with names that carry no namespace at all.
+        child.tag = child.tag.split("}")[-1]
+        child.attrib = dict(
+            (k.split("}")[-1], v)
+            for k, v in child.attrib.items()
+        )
+    return cElementTree.tostring(root, encoding=encoding)
+
+
+def get_list_style(numbering_root, num_id, ilvl):
+    """
+    Return the `numFmt` value that the numbering definitions give to list
+    `num_id` at level `ilvl`; None is returned when nothing matches.
+    """
+    # This is needed on both the custom lxml parser and the pydocx parser. So
+    # make it a function.
+    for num in find_all(numbering_root, 'num'):
+        if num.attrib['numId'] != num_id:
+            continue
+        abstract_id = num.find('abstractNumId').attrib['val']
+        for abstract_num in find_all(numbering_root, 'abstractNum'):
+            if abstract_num.attrib['abstractNumId'] != abstract_id:
+                continue
+            for candidate in el_iter(abstract_num):
+                # Skip elements that explicitly name another level.
+                if candidate.attrib.get('ilvl', ilvl) != ilvl:
+                    continue
+                num_fmt = candidate.find('numFmt')
+                if num_fmt is not None:
+                    return num_fmt.attrib['val']
+    return None
+
+
+class NamespacedNumId(object):
+    """A list's numId qualified by the number of tables it sits in."""
+    def __init__(self, num_id, num_tables, *args, **kwargs):
+        self._num_id = num_id
+        self._num_tables = num_tables
+
+    def __unicode__(self, *args, **kwargs):
+        return '%s:%d' % (self._num_id, self._num_tables)
+
+    def __repr__(self, *args, **kwargs):
+        return self.__unicode__(*args, **kwargs)
+
+    def __eq__(self, other):
+        if other is None:
+            return False
+        return repr(self) == repr(other)
+
+    def __ne__(self, other):
+        # Must mirror __eq__: an instance is never equal to None.
+        return not self.__eq__(other)
+
+    def __hash__(self):
+        return hash(repr(self))  # keep equal ids equal in sets and dicts
+
+    @property
+    def num_id(self):
+        return self._num_id
+
+
+class PydocxPreProcessor(MulitMemoizeMixin):
+    # Collects per-element facts about a document tree in `self.meta_data`.
+    def __init__(
+            self,
+            convert_root_level_upper_roman=False,
+            styles_dict=None,
+            numbering_root=None,
+            *args, **kwargs):
+        self.meta_data = defaultdict(dict)
+        self.convert_root_level_upper_roman = convert_root_level_upper_roman
+        self.styles_dict = styles_dict
+        self.numbering_root = numbering_root
+
+    def perform_pre_processing(self, root, *args, **kwargs):
+        self.populate_memoization({
+            'find_first': find_first,
+        })
+        # Parents first: the later passes walk upwards through them.
+        self._add_parent(root)
+        # If we don't have a numbering root there cannot be any lists.
+        if self.numbering_root is not None:
+            self._set_list_attributes(root)
+        self._set_table_attributes(root)
+        self._set_is_in_table(root)
+
+        body = find_first(root, 'body')
+        p_elements = find_all(body, 'p')
+        list_elements = [
+            child for child in p_elements
+            if self.is_list_item(child)
+        ]
+        # Find the first and last li elements
+        num_ids = set([self.num_id(i) for i in list_elements])
+        ilvls = set([self.ilvl(i) for i in list_elements])
+        self._set_first_list_item(num_ids, ilvls, list_elements)
+        self._set_last_list_item(num_ids, list_elements)
+
+        self._set_headers(p_elements)
+        self._convert_upper_roman(body)
+        self._set_next(body)
+
+    def is_first_list_item(self, el):
+        return self.meta_data[el].get('is_first_list_item', False)
+
+    def is_last_list_item_in_root(self, el):
+        return self.meta_data[el].get('is_last_list_item_in_root', False)
+
+    def is_list_item(self, el):
+        return self.meta_data[el].get('is_list_item', False)
+
+    def num_id(self, el):
+        if not self.is_list_item(el):
+            return None
+        return self.meta_data[el].get('num_id')
+
+    def ilvl(self, el):
+        if not self.is_list_item(el):
+            return None
+        return self.meta_data[el].get('ilvl')
+
+    def heading_level(self, el):
+        return self.meta_data[el].get('heading_level')
+
+    def is_in_table(self, el):
+        return self.meta_data[el].get('is_in_table')
+
+    def row_index(self, el):
+        return self.meta_data[el].get('row_index')
+
+    def column_index(self, el):
+        return self.meta_data[el].get('column_index')
+
+    def vmerge_continue(self, el):
+        return self.meta_data[el].get('vmerge_continue')
+
+    def next(self, el):
+        if el not in self.meta_data:  # do not create an entry by asking
+            return
+        return self.meta_data[el].get('next')
+
+    def previous(self, el):
+        if el not in self.meta_data:  # do not create an entry by asking
+            return
+        return self.meta_data[el].get('previous')
+
+    def parent(self, el):
+        return self.meta_data[el].get('parent')
+
+    def _add_parent(self, el):  # if a parent, make that an attribute
+        for child in el.getchildren():
+            self.meta_data[child]['parent'] = el
+            self._add_parent(child)
+
+    def _set_list_attributes(self, el):
+        list_elements = find_all(el, 'numId')
+        for li in list_elements:
+            parent = find_ancestor_with_tag(self, li, 'p')
+            # Deleted text in a list will have a numId but no ilvl.
+            if parent is None:
+                continue
+            parent_ilvl = self.memod_tree_op('find_first', parent, 'ilvl')
+            if parent_ilvl is None:
+                continue
+            self.meta_data[parent]['is_list_item'] = True
+            self.meta_data[parent]['num_id'] = self._generate_num_id(parent)
+            self.meta_data[parent]['ilvl'] = parent_ilvl.attrib['val']
+
+    def _generate_num_id(self, el):
+        '''
+        Fun fact: It is possible to have a list in the root, that holds a table
+        that holds a list and for both lists to have the same numId. When this
+        happens we should namespace the nested list with the number of tables
+        it is in to ensure it is considered a new list. Otherwise all sorts of
+        terrible html gets generated.
+        '''
+        num_id = find_first(el, 'numId').attrib['val']
+
+        # First, go up the parent until we get None and count the number of
+        # tables there are.
+        num_tables = 0
+        while self.parent(el) is not None:
+            if el.tag == 'tbl':
+                num_tables += 1
+            el = self.parent(el)
+        return NamespacedNumId(
+            num_id=num_id,
+            num_tables=num_tables,
+        )
+
+    def _set_first_list_item(self, num_ids, ilvls, list_elements):
+        # Lists are grouped by having the same `num_id` and `ilvl`. The first
+        # list item is the first list item found for each `num_id` and `ilvl`
+        # combination.
+        for num_id in num_ids:
+            for ilvl in ilvls:
+                filtered_list_elements = [
+                    i for i in list_elements
+                    if (
+                        self.num_id(i) == num_id and
+                        self.ilvl(i) == ilvl
+                    )
+                ]
+                if not filtered_list_elements:
+                    continue
+                first_el = filtered_list_elements[0]
+                self.meta_data[first_el]['is_first_list_item'] = True
+
+    def _set_last_list_item(self, num_ids, list_elements):
+        # Find last list elements. Only mark list tags as the last list tag if
+        # it is in the root of the document. This is only used to ensure that
+        # once a root level list is finished we do not roll in the rest of the
+        # non list elements into the first root level list.
+        for num_id in num_ids:
+            filtered_list_elements = [
+                i for i in list_elements
+                if self.num_id(i) == num_id
+            ]
+            if not filtered_list_elements:
+                continue
+            last_el = filtered_list_elements[-1]
+            self.meta_data[last_el]['is_last_list_item_in_root'] = True
+
+    def _set_table_attributes(self, el):
+        tables = find_all(el, 'tbl')
+        for table in tables:
+            # `_filter_children` always gives a list, so a table without
+            # rows just skips the loop below.
+            rows = _filter_children(table, ['tr'])
+            for i, row in enumerate(rows):
+                tcs = _filter_children(row, ['tc'])
+                for j, child in enumerate(tcs):
+                    self.meta_data[child]['row_index'] = i
+                    self.meta_data[child]['column_index'] = j
+                    v_merge = find_first(child, 'vMerge')
+                    if (
+                            v_merge is not None and
+                            ('continue' == v_merge.get('val', '') or
+                             v_merge.attrib == {})
+                    ):
+                        self.meta_data[child]['vmerge_continue'] = True
+
+    def _set_is_in_table(self, el):
+        paragraph_elements = find_all(el, 'p')
+        for p in paragraph_elements:
+            if find_ancestor_with_tag(self, p, 'tc') is not None:
+                self.meta_data[p]['is_in_table'] = True
+
+    def _set_headers(self, elements):
+        # These are the styles for headers and what the html tag should be if
+        # we have one.
+        headers = {
+            'heading 1': 'h1',
+            'heading 2': 'h2',
+            'heading 3': 'h3',
+            'heading 4': 'h4',
+            'heading 5': 'h5',
+            'heading 6': 'h6',
+            'heading 7': 'h6',
+            'heading 8': 'h6',
+            'heading 9': 'h6',
+            'heading 10': 'h6',
+        }
+        # `styles_dict` defaults to None; treat that as "no styles known".
+        styles_dict = self.styles_dict or {}
+        # Drop the rPr of heading styles: the heading tag does the styling.
+        for styles in styles_dict.values():
+            if styles.get('style_name', '').lower() in headers:
+                styles.pop('default_run_properties', None)
+
+        for element in elements:
+            p_style = find_first(element, 'pStyle')
+            # No pStyle: the default style is in use, which is no heading.
+            if p_style is None:
+                continue
+            style = p_style.attrib.get('val', '')
+            metadata = styles_dict.get(style, {})
+            style_name = metadata.get('style_name')
+
+            # Check to see if this element is actually a header.
+            if style_name and style_name.lower() in headers:
+                # Set all the list item variables to false.
+                self.meta_data[element]['is_list_item'] = False
+                self.meta_data[element]['is_first_list_item'] = False
+                self.meta_data[element]['is_last_list_item_in_root'] = False
+                # Prime the heading_level
+                self.meta_data[element]['heading_level'] = headers[style_name.lower()]  # noqa
+
+    def _convert_upper_roman(self, body):
+        if not self.convert_root_level_upper_roman:
+            return
+        first_root_list_items = [
+            # Only root level elements.
+            el for el in body.getchildren()
+            # And only first_list_items
+            if self.is_first_list_item(el)
+        ]
+        visited_num_ids = []
+        all_p_tags_in_body = find_all(body, 'p')
+        for root_list_item in first_root_list_items:
+            if self.num_id(root_list_item) in visited_num_ids:
+                continue
+            visited_num_ids.append(self.num_id(root_list_item))
+            lst_style = get_list_style(
+                self.numbering_root,
+                self.num_id(root_list_item).num_id,
+                self.ilvl(root_list_item),
+            )
+            if lst_style != 'upperRoman':
+                continue
+            ilvl = min(
+                self.ilvl(el) for el in all_p_tags_in_body
+                if self.num_id(el) == self.num_id(root_list_item)
+            )
+            root_upper_roman_list_items = [
+                el for el in all_p_tags_in_body
+                if self.num_id(el) == self.num_id(root_list_item) and
+                self.ilvl(el) == ilvl
+            ]
+            for list_item in root_upper_roman_list_items:
+                self.meta_data[list_item]['is_list_item'] = False
+                self.meta_data[list_item]['is_first_list_item'] = False
+                self.meta_data[list_item]['is_last_list_item_in_root'] = False  # noqa
+
+                self.meta_data[list_item]['heading_level'] = UPPER_ROMAN_TO_HEADING_VALUE  # noqa
+
+    def _set_next(self, body):
+        def _get_children_with_content(el):
+            # We only care about children if they have text in them.
+            children = []
+            for child in _filter_children(el, TAGS_HOLDING_CONTENT_TAGS):
+                _has_descendant_with_tag = any(
+                    has_descendant_with_tag(child, tag) for
+                    tag in TAGS_CONTAINING_CONTENT
+                )
+                if _has_descendant_with_tag:
+                    children.append(child)
+            return children
+
+        def _assign_next(children):
+            # Populate the `next` and `previous` attributes for all the
+            # child elements. The first child gets no `previous` and the
+            # last one no `next`. The bounds are tested explicitly because
+            # indexing with `i - 1` at `i == 0` does not raise: it wraps
+            # around and would name the last child as the predecessor of
+            # the first one.
+            last = len(children) - 1
+            for i, child in enumerate(children):
+                if i < last:
+                    self.meta_data[child]['next'] = children[i + 1]
+                if i > 0:
+                    self.meta_data[child]['previous'] = children[i - 1]
+        # Assign next for everything in the root.
+        _assign_next(_get_children_with_content(body))
+
+        # In addition set next for everything in table cells.
+        for tc in find_all(body, 'tc'):
+            _assign_next(_get_children_with_content(tc))
+
+
+def parse_xml_from_string(xml):  # parse `xml` with namespaces stripped
+    return cElementTree.fromstring(remove_namespaces(xml))
diff --git a/requirements.txt b/requirements.txt
index f9954ad0..77421ff8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1 +1,4 @@
-beautifulsoup4>=4.1.0
+Jinja2>=2.0
+coverage==3.6
+nose==1.3.0
+flake8
diff --git a/run_tests.sh b/run_tests.sh
new file mode 100755
index 00000000..da46b811
--- /dev/null
+++ b/run_tests.sh
@@ -0,0 +1,4 @@
+#! /bin/sh
+# Run the tests (extra arguments go to nosetests), then lint all sources.
+nosetests --verbose --with-doctest --with-coverage --cover-package pydocx "$@" &&
+find . -name '*.py' -exec flake8 {} +
diff --git a/setup.py b/setup.py
new file mode 100644
index 00000000..defe5013
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,62 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+import os
+
+try:
+    from setuptools import setup, find_packages
+except ImportError:
+    from ez_setup import use_setuptools
+    use_setuptools()
+    from setuptools import setup, find_packages  # noqa
+rel_file = lambda *args: os.path.join(  # path below this file's directory
+    os.path.dirname(os.path.abspath(__file__)), *args)
+
+
+def get_file(filename):  # contents of `filename`, located via rel_file()
+    with open(rel_file(filename)) as f:
+        return f.read()
+
+
+def get_description():  # long_description: README followed by CHANGELOG
+    return get_file('README.rst') + get_file('CHANGELOG')
+
+setup(
+    name="PyDocX",
+    # Edit here and pydocx.__init__
+    version="0.3.13",
+    description="docx (OOXML) to html converter",
+    author="Jason Ward, Sam Portnow",
+    author_email="jason.louard.ward@gmail.com, samson91787@gmail.com",
+    url="http://github.com/CenterForOpenScience/pydocx",
+    platforms=["any"],
+    license="BSD",
+    packages=find_packages(),
+    package_data={  # non-python files shipped inside the package
+        'pydocx': [
+            'tests/templates/*.xml',
+        ],
+    },
+    scripts=[],
+    zip_safe=False,
+    install_requires=[],
+    cmdclass={},
+    classifiers=[
+        "Development Status :: 3 - Alpha",
+        "Programming Language :: Python",
+        "Programming Language :: Python :: 2.6",
+        "Programming Language :: Python :: 2.7",
+        "Programming Language :: Python :: 2 :: Only",
+        "Intended Audience :: Developers",
+        "License :: OSI Approved :: BSD License",
+        "Operating System :: OS Independent",
+        "Topic :: Text Processing :: Markup :: HTML",
+        "Topic :: Text Processing :: Markup :: XML",
+    ],
+    long_description=get_description(),
+    entry_points={
+        'console_scripts': [  # `pydocx` command -> main() in pydocx/__init__
+            'pydocx = pydocx.__init__:main',
+        ],
+    },
+)