From 1c7362fd8bce89c4d9d6c2d776954511e05b59a2 Mon Sep 17 00:00:00 2001 From: Alex-Kennedy-Laptop Date: Thu, 21 Nov 2019 00:42:27 -0700 Subject: [PATCH 01/16] test pycharm git connection --- xml_parse_to_csv.py | 1 + 1 file changed, 1 insertion(+) create mode 100644 xml_parse_to_csv.py diff --git a/xml_parse_to_csv.py b/xml_parse_to_csv.py new file mode 100644 index 0000000..638eff2 --- /dev/null +++ b/xml_parse_to_csv.py @@ -0,0 +1 @@ +print("hello world!") \ No newline at end of file From e5195e0bc0c78d09ca4a4dffb439541c21d030f6 Mon Sep 17 00:00:00 2001 From: Alex-Kennedy-Laptop Date: Thu, 21 Nov 2019 01:47:42 -0700 Subject: [PATCH 02/16] Import XML from URL and test --- src/xml_parse_to_csv.py | 12 ++++++++++++ test/test_xml_to_csv.py | 31 +++++++++++++++++++++++++++++++ xml_parse_to_csv.py | 1 - 3 files changed, 43 insertions(+), 1 deletion(-) create mode 100644 src/xml_parse_to_csv.py create mode 100644 test/test_xml_to_csv.py delete mode 100644 xml_parse_to_csv.py diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py new file mode 100644 index 0000000..a19cdcc --- /dev/null +++ b/src/xml_parse_to_csv.py @@ -0,0 +1,12 @@ +import xml.etree.ElementTree as ET +import urllib2 + + +def read_xml_to_tree(url): + """ + Read XML from URL into ElementTree object + :param url: XML file url + :return: ElementTree of XML object + """ + f_xml = urllib2.urlopen(url) + return ET.parse(f_xml) diff --git a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py new file mode 100644 index 0000000..f271414 --- /dev/null +++ b/test/test_xml_to_csv.py @@ -0,0 +1,31 @@ +from unittest import TestCase +# TODO: imports for tests go local or at top? 
+from src.xml_parse_to_csv import read_xml_to_tree +import xml.etree.ElementTree as ET + +# class XmlTestCase(TestCase): +# def setUp(self): +# self.xml_tree + + +class TestImportXML(TestCase): + def setUp(self): + # read the xml from url to tree + url = 'https://www.w3schools.com/xml/cd_catalog.xml' + self.tree = read_xml_to_tree(url) + + def test_read_xml_to_tree(self): + root = self.tree.getroot() + + # check that a tree is returned + self.assertIsInstance(self.tree, ET.ElementTree) + + # check that the root is has tag catalog + self.assertTrue(root.tag == 'CATALOG') + + # check that there are X children + self.assertTrue(len(list(root)) == 26) + + # check that each of the children are parsing all of their children + for child in root: + self.assertTrue(len(list(child)) == 6) diff --git a/xml_parse_to_csv.py b/xml_parse_to_csv.py deleted file mode 100644 index 638eff2..0000000 --- a/xml_parse_to_csv.py +++ /dev/null @@ -1 +0,0 @@ -print("hello world!") \ No newline at end of file From 747df688b44fdb0006153e1ffc0dd598311e880b Mon Sep 17 00:00:00 2001 From: Alex-Kennedy-Laptop Date: Thu, 21 Nov 2019 02:58:43 -0700 Subject: [PATCH 03/16] implemented find all rec and test --- src/xml_parse_to_csv.py | 19 +++++++++++++++++++ test/test_xml_to_csv.py | 37 ++++++++++++++++++++++++------------- 2 files changed, 43 insertions(+), 13 deletions(-) diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py index a19cdcc..817b6bf 100644 --- a/src/xml_parse_to_csv.py +++ b/src/xml_parse_to_csv.py @@ -10,3 +10,22 @@ def read_xml_to_tree(url): """ f_xml = urllib2.urlopen(url) return ET.parse(f_xml) + + +def find_all_rec(element, tag): + # TODO: could make this into a generator? 
+ """ + Recursively find all elements with tag within element + :param element: Element to search + :param tag: Tag to search for + :return: List of all sub-elements that have the tag + """ + # Base Case: no children, leaf element + if not list(element): + return [element] if element.tag == tag else [] + + # Recursive Step: internet element / root element + matches = element.findall(tag) + for child in list(element): + matches += find_all_rec(child, tag) + return matches diff --git a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py index f271414..c5a1868 100644 --- a/test/test_xml_to_csv.py +++ b/test/test_xml_to_csv.py @@ -1,11 +1,8 @@ from unittest import TestCase -# TODO: imports for tests go local or at top? -from src.xml_parse_to_csv import read_xml_to_tree +from copy import deepcopy import xml.etree.ElementTree as ET -# class XmlTestCase(TestCase): -# def setUp(self): -# self.xml_tree +from src.xml_parse_to_csv import read_xml_to_tree, find_all_rec class TestImportXML(TestCase): @@ -17,15 +14,29 @@ def setUp(self): def test_read_xml_to_tree(self): root = self.tree.getroot() - # check that a tree is returned + # basic xml tree structure inserts self.assertIsInstance(self.tree, ET.ElementTree) - - # check that the root is has tag catalog - self.assertTrue(root.tag == 'CATALOG') - - # check that there are X children - self.assertTrue(len(list(root)) == 26) + self.assertEqual(root.tag, 'CATALOG') + self.assertEqual(len(list(root)), 26) # check that each of the children are parsing all of their children for child in root: - self.assertTrue(len(list(child)) == 6) + self.assertEqual(len(list(child)), 6) + + def test_find_all_rec(self): + # create an instance of the tree with duplicate tags in tree for testing find all rec method + tree_with_duplicate_tag = deepcopy(self.tree) + root = tree_with_duplicate_tag.getroot() + + # delete all children except the first one + children = list(root) + for i in range(1, len(children)): + root.remove(children[i]) + + # add a 
copy of the only child of tree to each grandchild + add_to = list(root)[0] + to_add = deepcopy(add_to) + for child in add_to: + child.append(to_add) + + self.assertEqual(len(find_all_rec(root, "CD")), 7) From d14f5b0a2be0be28c42f1d708af8e68394fe0f87 Mon Sep 17 00:00:00 2001 From: Alex-Kennedy-Laptop Date: Thu, 21 Nov 2019 04:06:39 -0700 Subject: [PATCH 04/16] Built structure for extract_element and its tests --- src/__init__.py | 1 + src/xml_parse_to_csv.py | 26 ++++++++++++++++ test/test_xml_to_csv.py | 68 +++++++++++++++++++++++++++++++++-------- 3 files changed, 83 insertions(+), 12 deletions(-) create mode 100644 src/__init__.py diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..9108c5b --- /dev/null +++ b/src/__init__.py @@ -0,0 +1 @@ +from xml_parse_to_csv import read_xml_to_tree, find_all_rec, extract_element_basic \ No newline at end of file diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py index 817b6bf..ded458d 100644 --- a/src/xml_parse_to_csv.py +++ b/src/xml_parse_to_csv.py @@ -29,3 +29,29 @@ def find_all_rec(element, tag): for child in list(element): matches += find_all_rec(child, tag) return matches + + +# TODO: document exceptions for extract element +# TODO: add more element types that we can extract besides basic and list +def extract_element_basic(element, tag): + """ + Extract a basic type (text field, leaf node) sub-element from parent with given tag + + :param element: Parent element to extract from + :param tag: Tag of element to extract + :return: Extracted element as a string + """ + pass + + +def extract_element_list(element, list_tag, list_element_tag): + """ + Extract a list type (internal node, all children leaf nodes) sub-element from parent with given list_tag and + elements having list_element_tag + + :param element: parent element + :param list_tag: + :param list_element_tag: + :return: Extracted element as a list + """ + pass diff --git a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py 
index c5a1868..0e6ac76 100644 --- a/test/test_xml_to_csv.py +++ b/test/test_xml_to_csv.py @@ -2,14 +2,31 @@ from copy import deepcopy import xml.etree.ElementTree as ET -from src.xml_parse_to_csv import read_xml_to_tree, find_all_rec +from src import read_xml_to_tree, find_all_rec, extract_element_basic -class TestImportXML(TestCase): +class TestXMLToCSV(TestCase): + # TODO: TearDown def setUp(self): # read the xml from url to tree url = 'https://www.w3schools.com/xml/cd_catalog.xml' self.tree = read_xml_to_tree(url) + print(ET.tostring(self.tree.getroot())) + + # create an instance of the tree with duplicate tags in tree for testing find all rec method + self.tree_with_duplicate_tag = deepcopy(self.tree) + root = self.tree_with_duplicate_tag.getroot() + + # delete all children except the first one + children = list(root) + for i in range(1, len(children)): + root.remove(children[i]) + + # add a copy of the only child of tree to each grandchild + add_to = list(root)[0] + to_add = deepcopy(add_to) + for child in add_to: + child.append(to_add) def test_read_xml_to_tree(self): root = self.tree.getroot() @@ -24,19 +41,46 @@ def test_read_xml_to_tree(self): self.assertEqual(len(list(child)), 6) def test_find_all_rec(self): - # create an instance of the tree with duplicate tags in tree for testing find all rec method - tree_with_duplicate_tag = deepcopy(self.tree) - root = tree_with_duplicate_tag.getroot() + self.assertEqual(len(find_all_rec(self.tree_with_duplicate_tag.getroot(), "CD")), 7) - # delete all children except the first one + def test_extract_element_basic_success(self): + # Success: Basic element (text field, leaf node) + + # create CD catalog with 1 CD for easier testing + ee_tree = deepcopy(self.tree) + root = ee_tree.getroot() children = list(root) for i in range(1, len(children)): root.remove(children[i]) - # add a copy of the only child of tree to each grandchild - add_to = list(root)[0] - to_add = deepcopy(add_to) - for child in add_to: - 
child.append(to_add) + root_tag, extract_tag = 'CD', 'Title' + elements = find_all_rec(root, root_tag)[0] # use the first CD in the list for testing + self.assertEqual(extract_element_basic(elements, extract_tag), "Empire Burlesque") + + +class TestExtractElement(TestCase): + # TODO: Tear Down + def setUp(self): + pass + + def test_extract_element_success(self): + # Tests success case of both extract element methods + self.fail() + + def test_extract_element_fail_not_exists(self): + # Tests failure case of both extract element methods + # Failure: Sub-element with tag does not exist within parent element + self.fail() + + def test_extract_element_fail_duplicate_key(self): + # Tests failure case of both extract element methods + # Failure: Multiple sub-elements found with tag within parent element + self.fail() + + def test_extract_element_basic_failure_not_basic(self): + # Failure: sub-element is not type basic (text field, leaf node) + self.fail() - self.assertEqual(len(find_all_rec(root, "CD")), 7) + def test_extract_element_failure_not_list(self): + # Failure: sub-element is not type list () + self.fail() From fe8dcdb8f6ae7779ae931a6c44e0bc7c1b54802c Mon Sep 17 00:00:00 2001 From: Alex-Kennedy-Laptop Date: Thu, 21 Nov 2019 04:34:53 -0700 Subject: [PATCH 05/16] Finish test cases for extract element methods --- test/test_xml_to_csv.py | 68 +++++++++++++++++++++++++++++------------ 1 file changed, 49 insertions(+), 19 deletions(-) diff --git a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py index 0e6ac76..42aa88f 100644 --- a/test/test_xml_to_csv.py +++ b/test/test_xml_to_csv.py @@ -2,7 +2,8 @@ from copy import deepcopy import xml.etree.ElementTree as ET -from src import read_xml_to_tree, find_all_rec, extract_element_basic +# TODO: refactor all of these imports +from src import read_xml_to_tree, find_all_rec, extract_element_basic, extract_element_list class TestXMLToCSV(TestCase): @@ -43,44 +44,73 @@ def test_read_xml_to_tree(self): def test_find_all_rec(self): 
self.assertEqual(len(find_all_rec(self.tree_with_duplicate_tag.getroot(), "CD")), 7) - def test_extract_element_basic_success(self): - # Success: Basic element (text field, leaf node) +class TestExtractElement(TestCase): + # Sauce: https://stackoverflow.com/questions/8672754/how-to-show-the-error-messages-caught-by-assertraises-in-unittest-in-python2-7 + def assertRaisesWithMessage(self, msg, func, *args, **kwargs): + try: + func(*args, **kwargs) + self.assertFail() + except Exception as inst: + self.assertEqual(inst.message, msg) + + # TODO: Tear Down + def setUp(self): # create CD catalog with 1 CD for easier testing - ee_tree = deepcopy(self.tree) - root = ee_tree.getroot() + url = 'https://www.w3schools.com/xml/cd_catalog.xml' + self.tree = read_xml_to_tree(url) + root = self.tree.getroot() children = list(root) for i in range(1, len(children)): root.remove(children[i]) - root_tag, extract_tag = 'CD', 'Title' - elements = find_all_rec(root, root_tag)[0] # use the first CD in the list for testing - self.assertEqual(extract_element_basic(elements, extract_tag), "Empire Burlesque") + # create a list element for testing + to_add = ET.Element('SUBARTIST') + to_add.text = 'TEST' + add_to = find_all_rec(root, 'ARTIST')[0] + for i in range(3): + add_to.append(deepcopy(to_add)) - -class TestExtractElement(TestCase): - # TODO: Tear Down - def setUp(self): - pass + print(ET.tostring(root)) def test_extract_element_success(self): - # Tests success case of both extract element methods - self.fail() + parent = find_all_rec(self.tree.getroot(), 'CD')[0] + + # Basic + self.assertEqual(extract_element_basic(parent, 'TITLE'), 'Empire Burlesque') + + # List + compare = ['TEST', 'TEST', 'TEST'] + self.assertEqual(extract_element_list(parent, 'ARTIST', 'SUBARTIST')) def test_extract_element_fail_not_exists(self): # Tests failure case of both extract element methods # Failure: Sub-element with tag does not exist within parent element - self.fail() + parent = 
find_all_rec(self.tree.getroot(), 'CD')[0] + + # Basic + self.assertRaisesWithMessage('Element does not exist.', extract_element_basic, parent, 'NOT_THERE') + + # List + self.assertRaisesWithMessage('Element does not exist.', extract_element_list, parent, 'NOT_THERE') def test_extract_element_fail_duplicate_key(self): # Tests failure case of both extract element methods # Failure: Multiple sub-elements found with tag within parent element - self.fail() + parent = find_all_rec(self.tree.getroot(), 'CD')[0] + + # Basic + self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_basic, parent, 'SUBARTIST') + + # List + self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_list, parent, 'SUBARTIST') def test_extract_element_basic_failure_not_basic(self): # Failure: sub-element is not type basic (text field, leaf node) - self.fail() + parent = find_all_rec(self.tree.getroot(), 'CD')[0] + self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_basic, parent, 'ARTIST') def test_extract_element_failure_not_list(self): # Failure: sub-element is not type list () - self.fail() + parent = find_all_rec(self.tree.getroot(), 'CD')[0] + self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_list, parent, 'TITLE') From 190eded4ffc4188480e357143c9d71f7e44d35ab Mon Sep 17 00:00:00 2001 From: Alex-Kennedy-Laptop Date: Thu, 21 Nov 2019 05:31:21 -0700 Subject: [PATCH 06/16] Implemented extract element and tests. 
All test cases passed --- src/xml_parse_to_csv.py | 59 ++++++++++++++++++++++++++++++++++------- test/test_xml_to_csv.py | 32 +++++++++++++++------- 2 files changed, 73 insertions(+), 18 deletions(-) diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py index ded458d..e7c7fe7 100644 --- a/src/xml_parse_to_csv.py +++ b/src/xml_parse_to_csv.py @@ -20,19 +20,18 @@ def find_all_rec(element, tag): :param tag: Tag to search for :return: List of all sub-elements that have the tag """ - # Base Case: no children, leaf element - if not list(element): - return [element] if element.tag == tag else [] - - # Recursive Step: internet element / root element - matches = element.findall(tag) + matches = [] for child in list(element): + if child.tag == tag: + matches += [child] matches += find_all_rec(child, tag) return matches # TODO: document exceptions for extract element # TODO: add more element types that we can extract besides basic and list +# TODO: custom exceptions for the failure cases +# TODO: better error messages for the failure cases def extract_element_basic(element, tag): """ Extract a basic type (text field, leaf node) sub-element from parent with given tag @@ -41,12 +40,21 @@ def extract_element_basic(element, tag): :param tag: Tag of element to extract :return: Extracted element as a string """ - pass + extract_elements = find_all_rec(element, tag) + validate_extract_element(extract_elements) # validate exists and non-duplicate + + # failure: element is not basic type + return_element = extract_elements[0] + if list(return_element): # if element has children, then not basic type + raise Exception('Element not basic type.') + + # success! 
+ return return_element def extract_element_list(element, list_tag, list_element_tag): """ - Extract a list type (internal node, all children leaf nodes) sub-element from parent with given list_tag and + Extract a list type (internal node, all children basic type) sub-element from parent with given list_tag and elements having list_element_tag :param element: parent element @@ -54,4 +62,37 @@ def extract_element_list(element, list_tag, list_element_tag): :param list_element_tag: :return: Extracted element as a list """ - pass + extract_elements = find_all_rec(element, list_tag) + validate_extract_element(extract_elements) # validate exists and non-duplicate + + # failure: element is not list type + possible_list = extract_elements[0] + if not list(possible_list): # validate that the possible list element is not basic + raise Exception('Element not list type.') + + return_list = list(possible_list) + for ele in return_list: # validate that list element is basic type and ele tag matches + if ele.tag != list_element_tag or list(ele): + raise Exception('Element not list type.') + + # success! + return return_list + + + +def validate_extract_element(found_elements): + """ + Validate conditions for extract element: + Element must exist + Element must not have duplicate tag as another element + Helper method for extract_element_... methods + :param found_elements: Preliminary element list found by extract_element_... 
+ :return: None + """ + # failure: element with tag doesn't exist + if not found_elements: + raise Exception('Element does not exist.') + + # failure: multiple elements with same tag + if len(found_elements) > 1: + raise Exception('Duplicate extraction tag.') diff --git a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py index 42aa88f..8ee4432 100644 --- a/test/test_xml_to_csv.py +++ b/test/test_xml_to_csv.py @@ -3,9 +3,11 @@ import xml.etree.ElementTree as ET # TODO: refactor all of these imports -from src import read_xml_to_tree, find_all_rec, extract_element_basic, extract_element_list +import src +from src import read_xml_to_tree, find_all_rec, extract_element_basic, extract_element_list, validate_extract_element +# TODO: on assert equals, i got the expected and actual backwards for most of these class TestXMLToCSV(TestCase): # TODO: TearDown def setUp(self): @@ -42,9 +44,11 @@ def test_read_xml_to_tree(self): self.assertEqual(len(list(child)), 6) def test_find_all_rec(self): + self.assertEqual(len(find_all_rec(self.tree.getroot(), 'TITLE')), 26) self.assertEqual(len(find_all_rec(self.tree_with_duplicate_tag.getroot(), "CD")), 7) +# TODO: can refactor 'parent = find_all_rec(self.tree.getroot(), 'CD')[0]' into setup class TestExtractElement(TestCase): # Sauce: https://stackoverflow.com/questions/8672754/how-to-show-the-error-messages-caught-by-assertraises-in-unittest-in-python2-7 def assertRaisesWithMessage(self, msg, func, *args, **kwargs): @@ -71,17 +75,16 @@ def setUp(self): for i in range(3): add_to.append(deepcopy(to_add)) - print(ET.tostring(root)) + print(ET.tostring(self.tree.getroot())) def test_extract_element_success(self): parent = find_all_rec(self.tree.getroot(), 'CD')[0] - # Basic - self.assertEqual(extract_element_basic(parent, 'TITLE'), 'Empire Burlesque') + self.assertEqual(extract_element_basic(parent, 'TITLE').text, 'Empire Burlesque') # List compare = ['TEST', 'TEST', 'TEST'] - self.assertEqual(extract_element_list(parent, 'ARTIST', 
'SUBARTIST')) + self.assertEqual([ele.text for ele in extract_element_list(parent, 'ARTIST', 'SUBARTIST')], compare) def test_extract_element_fail_not_exists(self): # Tests failure case of both extract element methods @@ -92,7 +95,7 @@ def test_extract_element_fail_not_exists(self): self.assertRaisesWithMessage('Element does not exist.', extract_element_basic, parent, 'NOT_THERE') # List - self.assertRaisesWithMessage('Element does not exist.', extract_element_list, parent, 'NOT_THERE') + self.assertRaisesWithMessage('Element does not exist.', extract_element_list, parent, 'NOT_THERE', 'STILL_NOT_THERE') def test_extract_element_fail_duplicate_key(self): # Tests failure case of both extract element methods @@ -103,14 +106,25 @@ def test_extract_element_fail_duplicate_key(self): self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_basic, parent, 'SUBARTIST') # List - self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_list, parent, 'SUBARTIST') + self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_list, parent, 'SUBARTIST', 'SUBSUBARTIST') def test_extract_element_basic_failure_not_basic(self): # Failure: sub-element is not type basic (text field, leaf node) parent = find_all_rec(self.tree.getroot(), 'CD')[0] - self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_basic, parent, 'ARTIST') + self.assertRaisesWithMessage('Element not basic type.', extract_element_basic, parent, 'ARTIST') def test_extract_element_failure_not_list(self): # Failure: sub-element is not type list () parent = find_all_rec(self.tree.getroot(), 'CD')[0] - self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_list, parent, 'TITLE') + self.assertRaisesWithMessage('Element not list type.', extract_element_list, self.tree.getroot(), 'CD', 'TRACK') + + def test_validate_extract_element(self): + parent = find_all_rec(self.tree.getroot(), 'CD')[0] + + # Does not exist + found_elements = 
find_all_rec(parent, 'NOT_THERE') + self.assertRaisesWithMessage('Element does not exist.', validate_extract_element, found_elements) + + # Duplicate tag + found_elements = find_all_rec(parent, 'SUBARTIST') + self.assertRaisesWithMessage('Duplicate extraction tag.', validate_extract_element, found_elements) From 0af78d13e3478333faa7131830eac73f1332ade8 Mon Sep 17 00:00:00 2001 From: Alex-Kennedy-Laptop Date: Thu, 21 Nov 2019 07:07:14 -0700 Subject: [PATCH 07/16] implemented bulk extract and tests all passed --- .gitignore | 1 - src/__init__.py | 3 +- src/xml_parse_to_csv.py | 40 +++++++++++++++++++++++-- test/test_xml_to_csv.py | 66 +++++++++++++++++++++++++++++++++++++---- 4 files changed, 100 insertions(+), 10 deletions(-) diff --git a/.gitignore b/.gitignore index db1f598..259c6f0 100644 --- a/.gitignore +++ b/.gitignore @@ -14,7 +14,6 @@ dist/ downloads/ eggs/ .eggs/ -lib/ lib64/ parts/ sdist/ diff --git a/src/__init__.py b/src/__init__.py index 9108c5b..da1434b 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1 +1,2 @@ -from xml_parse_to_csv import read_xml_to_tree, find_all_rec, extract_element_basic \ No newline at end of file +# TODO: How can I fix this up? 
+from xml_parse_to_csv import read_xml_to_tree, find_all_rec, extract_element_basic, extract_element_list, validate_extract_element, bulk_extract \ No newline at end of file diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py index e7c7fe7..7b2769e 100644 --- a/src/xml_parse_to_csv.py +++ b/src/xml_parse_to_csv.py @@ -67,8 +67,6 @@ def extract_element_list(element, list_tag, list_element_tag): # failure: element is not list type possible_list = extract_elements[0] - if not list(possible_list): # validate that the possible list element is not basic - raise Exception('Element not list type.') return_list = list(possible_list) for ele in return_list: # validate that list element is basic type and ele tag matches @@ -79,7 +77,6 @@ def extract_element_list(element, list_tag, list_element_tag): return return_list - def validate_extract_element(found_elements): """ Validate conditions for extract element: @@ -96,3 +93,40 @@ def validate_extract_element(found_elements): # failure: multiple elements with same tag if len(found_elements) > 1: raise Exception('Duplicate extraction tag.') + + +# TODO: make the assumption that one row element will not be inside another row element +# this means that in this problem, a Listing element will not be inside another Listing element +# This needs to be more robust... What happens if there are nested elements and we want to get information about both of +# them for a row? + +# TODO: default behavior turns a list into a string with elements sep by commas. Could add functionality to make this variable later. 
+def bulk_extract(tree, row_tag, elements_basic=[], elements_list=[]): + """ + Creates a dictionary with keys that represent the parent element of a row and values that are the columns in that row + Preparation for loading into a pandas DF + Structure: + { + key (row_element): value ( + { + key (sub-element tag): value (element text) + } + ) + } + :param tree: xml tree to load all this information from + :param row_tag: tag for the elements to extract from + :param elements_basic: list of tags for the basic elements to extract + :param elements_list: list of tags for the list elements to extract + :return: Dict prepared for DF load + """ + row_elements = find_all_rec(tree.getroot(), row_tag) + df_dict = {} + for element in row_elements: + df_row_entry = {} + for basics in elements_basic: + df_row_entry[basics] = extract_element_basic(element, basics).text + for lists in elements_list: + df_row_entry[lists["list_tag"]] = ", ".join([ele.text for ele in extract_element_list(element, lists["list_tag"], lists["list_element_tag"])]) + df_dict[element] = df_row_entry + return df_dict + diff --git a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py index 8ee4432..a980205 100644 --- a/test/test_xml_to_csv.py +++ b/test/test_xml_to_csv.py @@ -7,6 +7,7 @@ from src import read_xml_to_tree, find_all_rec, extract_element_basic, extract_element_list, validate_extract_element +# TODO: all of this assumes that the client is connected to the internet. 
Could add some functionality to check this # TODO: on assert equals, i got the expected and actual backwards for most of these class TestXMLToCSV(TestCase): # TODO: TearDown @@ -14,7 +15,7 @@ def setUp(self): # read the xml from url to tree url = 'https://www.w3schools.com/xml/cd_catalog.xml' self.tree = read_xml_to_tree(url) - print(ET.tostring(self.tree.getroot())) + # print(ET.tostring(self.tree.getroot())) # create an instance of the tree with duplicate tags in tree for testing find all rec method self.tree_with_duplicate_tag = deepcopy(self.tree) @@ -30,6 +31,7 @@ def setUp(self): to_add = deepcopy(add_to) for child in add_to: child.append(to_add) + # print(ET.tostring(self.tree.getroot())) def test_read_xml_to_tree(self): root = self.tree.getroot() @@ -47,6 +49,60 @@ def test_find_all_rec(self): self.assertEqual(len(find_all_rec(self.tree.getroot(), 'TITLE')), 26) self.assertEqual(len(find_all_rec(self.tree_with_duplicate_tag.getroot(), "CD")), 7) + def test_bulk_extract(self): + test_tree = deepcopy(self.tree) + root = test_tree.getroot() + children = list(root) + + # add a list element to the first three to test that functionality + for child_i in range(3): + to_add = ET.Element('SUBARTIST') + to_add.text = 'TEST' + add_to = children[child_i].find('ARTIST') + for _ in range(child_i): + add_to.append(deepcopy(to_add)) + + # truncate to 3 children + for i in range(3, len(children)): + root.remove(children[i]) + + expected = [ + { + "TITLE": "Empire Burlesque", + "ARTIST": "" + }, + { + "TITLE": "Hide your heart", + "ARTIST": "TEST" + }, + { + "TITLE": "Greatest Hits", + "ARTIST": "TEST, TEST" + } + ] + + actual = src.bulk_extract( + test_tree, + "CD", + elements_basic=["TITLE"], + elements_list=[{"list_tag": "ARTIST", "list_element_tag": "SUBARTIST"}] + ).values() + + # TODO: comment and explain this + expected = list(expected) # make a mutable copy + try: + for elem in actual: + expected.remove(elem) + except ValueError: + return False + return not expected 
+ + self.assertFalse(expected) + + # TODO: more error testing here + # assert exception for arguments that aren't valid + self.assertRaises(Exception, src.bulk_extract, test_tree, "CD", basic=["TITLE"], list=["ARTIST"]) + # TODO: can refactor 'parent = find_all_rec(self.tree.getroot(), 'CD')[0]' into setup class TestExtractElement(TestCase): @@ -75,8 +131,6 @@ def setUp(self): for i in range(3): add_to.append(deepcopy(to_add)) - print(ET.tostring(self.tree.getroot())) - def test_extract_element_success(self): parent = find_all_rec(self.tree.getroot(), 'CD')[0] # Basic @@ -95,7 +149,8 @@ def test_extract_element_fail_not_exists(self): self.assertRaisesWithMessage('Element does not exist.', extract_element_basic, parent, 'NOT_THERE') # List - self.assertRaisesWithMessage('Element does not exist.', extract_element_list, parent, 'NOT_THERE', 'STILL_NOT_THERE') + self.assertRaisesWithMessage('Element does not exist.', extract_element_list, parent, 'NOT_THERE', + 'STILL_NOT_THERE') def test_extract_element_fail_duplicate_key(self): # Tests failure case of both extract element methods @@ -106,7 +161,8 @@ def test_extract_element_fail_duplicate_key(self): self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_basic, parent, 'SUBARTIST') # List - self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_list, parent, 'SUBARTIST', 'SUBSUBARTIST') + self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_list, parent, 'SUBARTIST', + 'SUBSUBARTIST') def test_extract_element_basic_failure_not_basic(self): # Failure: sub-element is not type basic (text field, leaf node) From 94978aec5f1ea238151832af3e5da23e34ed3855 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Nov 2019 00:12:05 -0700 Subject: [PATCH 08/16] add custom exceptions for extract element --- src/exceptions.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 src/exceptions.py diff --git a/src/exceptions.py b/src/exceptions.py new file mode 
100644 index 0000000..ce60186 --- /dev/null +++ b/src/exceptions.py @@ -0,0 +1,10 @@ +class AmbiguousElement(Exception): + pass + + +class MissingElement(Exception): + pass + + +class WrongElementType(Exception): + pass From 411d5d2686c5ec0fa9bb70ae48994132a8980531 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Nov 2019 00:12:31 -0700 Subject: [PATCH 09/16] make blank --- src/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/__init__.py b/src/__init__.py index da1434b..e69de29 100644 --- a/src/__init__.py +++ b/src/__init__.py @@ -1,2 +0,0 @@ -# TODO: How can I fix this up? -from xml_parse_to_csv import read_xml_to_tree, find_all_rec, extract_element_basic, extract_element_list, validate_extract_element, bulk_extract \ No newline at end of file From 4a867c5f6f38ac4e9bf5a25aa6186fb0d0cf11a6 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Nov 2019 00:13:10 -0700 Subject: [PATCH 10/16] minor import changes, refactor for new exceptions, update bulk_export functionality --- test/test_xml_to_csv.py | 91 ++++++++++++++++++++--------------------- 1 file changed, 45 insertions(+), 46 deletions(-) diff --git a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py index a980205..4702ee8 100644 --- a/test/test_xml_to_csv.py +++ b/test/test_xml_to_csv.py @@ -1,10 +1,9 @@ +from pprint import pprint from unittest import TestCase from copy import deepcopy import xml.etree.ElementTree as ET - -# TODO: refactor all of these imports -import src -from src import read_xml_to_tree, find_all_rec, extract_element_basic, extract_element_list, validate_extract_element +from src.exceptions import MissingElement, AmbiguousElement, WrongElementType +import src.xml_parse_to_csv as xmltocsv # TODO: all of this assumes that the client is connected to the internet. 
Could add some functionality to check this @@ -14,7 +13,7 @@ class TestXMLToCSV(TestCase): def setUp(self): # read the xml from url to tree url = 'https://www.w3schools.com/xml/cd_catalog.xml' - self.tree = read_xml_to_tree(url) + self.tree = xmltocsv.read_xml_to_tree(url) # print(ET.tostring(self.tree.getroot())) # create an instance of the tree with duplicate tags in tree for testing find all rec method @@ -46,8 +45,8 @@ def test_read_xml_to_tree(self): self.assertEqual(len(list(child)), 6) def test_find_all_rec(self): - self.assertEqual(len(find_all_rec(self.tree.getroot(), 'TITLE')), 26) - self.assertEqual(len(find_all_rec(self.tree_with_duplicate_tag.getroot(), "CD")), 7) + self.assertEqual(len(xmltocsv.find_all_rec(self.tree.getroot(), 'TITLE')), 26) + self.assertEqual(len(xmltocsv.find_all_rec(self.tree_with_duplicate_tag.getroot(), "CD")), 7) def test_bulk_extract(self): test_tree = deepcopy(self.tree) @@ -81,44 +80,47 @@ def test_bulk_extract(self): } ] - actual = src.bulk_extract( + actual = xmltocsv.bulk_extract( test_tree, "CD", elements_basic=["TITLE"], elements_list=[{"list_tag": "ARTIST", "list_element_tag": "SUBARTIST"}] ).values() - + actual = [xmltocsv.stringify_bulk_extract(row_dict) for row_dict in actual] + print(pprint(actual)) # TODO: comment and explain this expected = list(expected) # make a mutable copy + expected_equals_actual = True try: for elem in actual: expected.remove(elem) except ValueError: - return False - return not expected + expected_equals_actual = False + if expected: + expected_equals_actual = False - self.assertFalse(expected) + self.assertTrue(expected_equals_actual) # TODO: more error testing here # assert exception for arguments that aren't valid - self.assertRaises(Exception, src.bulk_extract, test_tree, "CD", basic=["TITLE"], list=["ARTIST"]) + self.assertRaises(Exception, xmltocsv.bulk_extract, test_tree, "CD", basic=["TITLE"], list=["ARTIST"]) # TODO: can refactor 'parent = find_all_rec(self.tree.getroot(), 
'CD')[0]' into setup class TestExtractElement(TestCase): - # Sauce: https://stackoverflow.com/questions/8672754/how-to-show-the-error-messages-caught-by-assertraises-in-unittest-in-python2-7 - def assertRaisesWithMessage(self, msg, func, *args, **kwargs): - try: - func(*args, **kwargs) - self.assertFail() - except Exception as inst: - self.assertEqual(inst.message, msg) + # # Sauce: https://stackoverflow.com/questions/8672754/how-to-show-the-error-messages-caught-by-assertraises-in-unittest-in-python2-7 + # def assertRaisesWithMessage(self, msg, func, *args, **kwargs): + # try: + # func(*args, **kwargs) + # self.assertFail() + # except Exception as inst: + # self.assertEqual(inst.message, msg) # TODO: Tear Down def setUp(self): # create CD catalog with 1 CD for easier testing url = 'https://www.w3schools.com/xml/cd_catalog.xml' - self.tree = read_xml_to_tree(url) + self.tree = xmltocsv.read_xml_to_tree(url) root = self.tree.getroot() children = list(root) for i in range(1, len(children)): @@ -127,60 +129,57 @@ def setUp(self): # create a list element for testing to_add = ET.Element('SUBARTIST') to_add.text = 'TEST' - add_to = find_all_rec(root, 'ARTIST')[0] + add_to = xmltocsv.find_all_rec(root, 'ARTIST')[0] for i in range(3): add_to.append(deepcopy(to_add)) def test_extract_element_success(self): - parent = find_all_rec(self.tree.getroot(), 'CD')[0] + parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] # Basic - self.assertEqual(extract_element_basic(parent, 'TITLE').text, 'Empire Burlesque') + self.assertEqual(xmltocsv.extract_basic(parent, 'TITLE').text, 'Empire Burlesque') # List compare = ['TEST', 'TEST', 'TEST'] - self.assertEqual([ele.text for ele in extract_element_list(parent, 'ARTIST', 'SUBARTIST')], compare) + self.assertEqual([ele.text for ele in xmltocsv.extract_list(parent, 'ARTIST', 'SUBARTIST')], compare) def test_extract_element_fail_not_exists(self): # Tests failure case of both extract element methods # Failure: Sub-element with tag does 
not exist within parent element - parent = find_all_rec(self.tree.getroot(), 'CD')[0] + parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] # Basic - self.assertRaisesWithMessage('Element does not exist.', extract_element_basic, parent, 'NOT_THERE') + self.assertRaises(MissingElement, xmltocsv.extract_basic, parent, 'NOT_THERE') # List - self.assertRaisesWithMessage('Element does not exist.', extract_element_list, parent, 'NOT_THERE', - 'STILL_NOT_THERE') + self.assertRaises(MissingElement, xmltocsv.extract_list, parent, 'NOT_THERE', 'STILL_NOT_THERE') def test_extract_element_fail_duplicate_key(self): # Tests failure case of both extract element methods # Failure: Multiple sub-elements found with tag within parent element - parent = find_all_rec(self.tree.getroot(), 'CD')[0] + parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] # Basic - self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_basic, parent, 'SUBARTIST') + self.assertRaises(AmbiguousElement, xmltocsv.extract_basic, parent, 'SUBARTIST') # List - self.assertRaisesWithMessage('Duplicate extraction tag.', extract_element_list, parent, 'SUBARTIST', - 'SUBSUBARTIST') + self.assertRaises(AmbiguousElement, xmltocsv.extract_list, parent, 'SUBARTIST', 'SUBSUBARTIST') - def test_extract_element_basic_failure_not_basic(self): - # Failure: sub-element is not type basic (text field, leaf node) - parent = find_all_rec(self.tree.getroot(), 'CD')[0] - self.assertRaisesWithMessage('Element not basic type.', extract_element_basic, parent, 'ARTIST') + def test_extract_element_failure_wrong_type(self): + parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] - def test_extract_element_failure_not_list(self): - # Failure: sub-element is not type list () - parent = find_all_rec(self.tree.getroot(), 'CD')[0] - self.assertRaisesWithMessage('Element not list type.', extract_element_list, self.tree.getroot(), 'CD', 'TRACK') + # Basic + self.assertRaises(WrongElementType, 
xmltocsv.extract_basic, parent, 'ARTIST') + + # List + self.assertRaises(WrongElementType, xmltocsv.extract_list, self.tree.getroot(), 'CD', 'TRACK') def test_validate_extract_element(self): - parent = find_all_rec(self.tree.getroot(), 'CD')[0] + parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] # Does not exist - found_elements = find_all_rec(parent, 'NOT_THERE') - self.assertRaisesWithMessage('Element does not exist.', validate_extract_element, found_elements) + found_elements = xmltocsv.find_all_rec(parent, 'NOT_THERE') + self.assertRaises(MissingElement, xmltocsv.validate_extract_element, found_elements) # Duplicate tag - found_elements = find_all_rec(parent, 'SUBARTIST') - self.assertRaisesWithMessage('Duplicate extraction tag.', validate_extract_element, found_elements) + found_elements = xmltocsv.find_all_rec(parent, 'SUBARTIST') + self.assertRaises(AmbiguousElement, xmltocsv.validate_extract_element, found_elements) From 8c31fbd10529c2942f127c9920fa48196cc2dead Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Nov 2019 00:14:01 -0700 Subject: [PATCH 11/16] refactor exceptions, working stringify bulk extract --- src/xml_parse_to_csv.py | 59 ++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 15 deletions(-) diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py index 7b2769e..52286bc 100644 --- a/src/xml_parse_to_csv.py +++ b/src/xml_parse_to_csv.py @@ -1,5 +1,8 @@ import xml.etree.ElementTree as ET import urllib2 +from copy import deepcopy + +from exceptions import AmbiguousElement, MissingElement, WrongElementType def read_xml_to_tree(url): @@ -13,13 +16,13 @@ def read_xml_to_tree(url): def find_all_rec(element, tag): - # TODO: could make this into a generator? 
""" Recursively find all elements with tag within element :param element: Element to search :param tag: Tag to search for :return: List of all sub-elements that have the tag """ + # TODO: could make this a generator matches = [] for child in list(element): if child.tag == tag: @@ -28,11 +31,8 @@ def find_all_rec(element, tag): return matches -# TODO: document exceptions for extract element -# TODO: add more element types that we can extract besides basic and list -# TODO: custom exceptions for the failure cases -# TODO: better error messages for the failure cases -def extract_element_basic(element, tag): +# TODO: add functionality for other element types +def extract_basic(element, tag): """ Extract a basic type (text field, leaf node) sub-element from parent with given tag @@ -46,13 +46,13 @@ def extract_element_basic(element, tag): # failure: element is not basic type return_element = extract_elements[0] if list(return_element): # if element has children, then not basic type - raise Exception('Element not basic type.') + raise WrongElementType() # success! return return_element -def extract_element_list(element, list_tag, list_element_tag): +def extract_list(element, list_tag, list_element_tag): """ Extract a list type (internal node, all children basic type) sub-element from parent with given list_tag and elements having list_element_tag @@ -71,7 +71,7 @@ def extract_element_list(element, list_tag, list_element_tag): return_list = list(possible_list) for ele in return_list: # validate that list element is basic type and ele tag matches if ele.tag != list_element_tag or list(ele): - raise Exception('Element not list type.') + raise WrongElementType("Expected 'List'") # success! 
return return_list @@ -88,11 +88,11 @@ def validate_extract_element(found_elements): """ # failure: element with tag doesn't exist if not found_elements: - raise Exception('Element does not exist.') + raise MissingElement() # failure: multiple elements with same tag if len(found_elements) > 1: - raise Exception('Duplicate extraction tag.') + raise AmbiguousElement() # TODO: make the assumption that one row element will not be inside another row element @@ -122,11 +122,40 @@ def bulk_extract(tree, row_tag, elements_basic=[], elements_list=[]): row_elements = find_all_rec(tree.getroot(), row_tag) df_dict = {} for element in row_elements: - df_row_entry = {} - for basics in elements_basic: - df_row_entry[basics] = extract_element_basic(element, basics).text + df_row_entry = { + "basics": {}, + "lists": {} + } + for basic in elements_basic: + df_row_entry["basics"][basic] = extract_basic(element, basic) for lists in elements_list: - df_row_entry[lists["list_tag"]] = ", ".join([ele.text for ele in extract_element_list(element, lists["list_tag"], lists["list_element_tag"])]) + df_row_entry["lists"][lists["list_tag"]] = extract_list(element, lists["list_tag"], lists["list_element_tag"]) df_dict[element] = df_row_entry return df_dict + +# TODO: docstring +def stringify_bulk_extract(extract_row): + a_copy = deepcopy(extract_row) + + # extract text from basic elements + bel_dict = a_copy["basics"] + for tag in bel_dict: + bel_dict[tag] = bel_dict[tag].text + + # extract text from list elements and join with comma according to project requirements + lel_dict = a_copy["lists"] + for tag in lel_dict: + for i in range(len(lel_dict[tag])): + lel_dict[tag][i] = lel_dict[tag][i].text + lel_dict[tag] = ", ".join(lel_dict[tag]) + + # remove top level of nested dictionary and compress basics and lists + # {'basics': {'TITLE': 'Greatest Hits'}, 'lists': {'ARTIST': 'TEST, TEST'}} -> + # {'TITLE': 'Greatest Hits', 'ARTIST': 'TEST, TEST'} + compressed = {} + for el_dict in 
a_copy.values(): + for key in el_dict: + compressed[key] = el_dict[key] + + return compressed From c3d94b32fab5ae9014a749c2c8a85989a0cf8581 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Nov 2019 01:36:30 -0700 Subject: [PATCH 12/16] add functionality for ancestry and tests for extract basic --- src/xml_parse_to_csv.py | 23 +++++++++++++++-------- test/test_xml_to_csv.py | 36 +++++++++++++++++++++++++++++++----- 2 files changed, 46 insertions(+), 13 deletions(-) diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py index 52286bc..1fed519 100644 --- a/src/xml_parse_to_csv.py +++ b/src/xml_parse_to_csv.py @@ -1,3 +1,5 @@ +# TODO: change to python 2.7 and run all test cases + import xml.etree.ElementTree as ET import urllib2 from copy import deepcopy @@ -40,13 +42,17 @@ def extract_basic(element, tag): :param tag: Tag of element to extract :return: Extracted element as a string """ - extract_elements = find_all_rec(element, tag) - validate_extract_element(extract_elements) # validate exists and non-duplicate + ancestry = tag.split('.') + current = element + for sub_tag in ancestry: + extract_elements = find_all_rec(current, sub_tag) + validate_extract_element(extract_elements) # validate exists and non-duplicate + current = extract_elements[0] # failure: element is not basic type - return_element = extract_elements[0] + return_element = current if list(return_element): # if element has children, then not basic type - raise WrongElementType() + raise WrongElementType("Expected 'basic'.") # success! return return_element @@ -92,7 +98,7 @@ def validate_extract_element(found_elements): # failure: multiple elements with same tag if len(found_elements) > 1: - raise AmbiguousElement() + raise AmbiguousElement("{}".format(found_elements[0].tag)) # TODO: make the assumption that one row element will not be inside another row element @@ -101,7 +107,8 @@ def validate_extract_element(found_elements): # them for a row? 
# TODO: default behavior turns a list into a string with elements sep by commas. Could add functionality to make this variable later. -def bulk_extract(tree, row_tag, elements_basic=[], elements_list=[]): +# TODO: refactor +def bulk_extract(root, row_tag, elements_basic=[], elements_list=[]): """ Creates a dictionary with keys that represent the parent element of a row and values that are the columns in that row Preparation for loading into a pandas DF @@ -113,13 +120,13 @@ def bulk_extract(tree, row_tag, elements_basic=[], elements_list=[]): } ) } - :param tree: xml tree to load all this information from + :param root: xml tree to load all this information from :param row_tag: tag for the elements to extract from :param elements_basic: list of tags for the basic elements to extract :param elements_list: list of tags for the list elements to extract :return: Dict prepared for DF load """ - row_elements = find_all_rec(tree.getroot(), row_tag) + row_elements = find_all_rec(root, row_tag) df_dict = {} for element in row_elements: df_row_entry = { diff --git a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py index 4702ee8..5f1a677 100644 --- a/test/test_xml_to_csv.py +++ b/test/test_xml_to_csv.py @@ -81,13 +81,12 @@ def test_bulk_extract(self): ] actual = xmltocsv.bulk_extract( - test_tree, + root, "CD", elements_basic=["TITLE"], elements_list=[{"list_tag": "ARTIST", "list_element_tag": "SUBARTIST"}] ).values() actual = [xmltocsv.stringify_bulk_extract(row_dict) for row_dict in actual] - print(pprint(actual)) # TODO: comment and explain this expected = list(expected) # make a mutable copy expected_equals_actual = True @@ -133,14 +132,41 @@ def setUp(self): for i in range(3): add_to.append(deepcopy(to_add)) + # TODO: add more layers of ancestry testing def test_extract_element_success(self): - parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] + parent_1LA = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] + + # build first test for 2 level ancestry 
realfake + parent_2LA_a = ET.Element("Top") + ET.SubElement(parent_2LA_a, "Mid") + bot_real = ET.Element("Bot") + bot_real.text = "real" + parent_2LA_a.find("Mid").append(bot_real) + bot_fake = ET.Element("Bot") + bot_fake.text = "decoy" + parent_2LA_a.append(bot_fake) + + # build second test for 2 level ancestry realfake + parent_2LA_b = ET.Element("Top") + ET.SubElement(parent_2LA_b, "Mid") + ET.SubElement(parent_2LA_b, "Mid2") + bot_real = ET.Element("Bot") + bot_real.text = "real" + parent_2LA_b.find("Mid").append(bot_real) + bot_fake = ET.Element("Bot") + bot_fake.text = "decoy" + parent_2LA_b.find("Mid2").append(bot_fake) + # Basic - self.assertEqual(xmltocsv.extract_basic(parent, 'TITLE').text, 'Empire Burlesque') + # 1 level ancestry + self.assertEqual(xmltocsv.extract_basic(parent_1LA, 'TITLE').text, 'Empire Burlesque') + # 2 level ancestry + self.assertEqual(xmltocsv.extract_basic(parent_2LA_a, 'Mid.Bot').text, 'real') + self.assertEqual(xmltocsv.extract_basic(parent_2LA_b, 'Mid2.Bot').text, 'decoy') # List compare = ['TEST', 'TEST', 'TEST'] - self.assertEqual([ele.text for ele in xmltocsv.extract_list(parent, 'ARTIST', 'SUBARTIST')], compare) + self.assertEqual([ele.text for ele in xmltocsv.extract_list(parent_1LA, 'ARTIST', 'SUBARTIST')], compare) def test_extract_element_fail_not_exists(self): # Tests failure case of both extract element methods From 614a6985c0fe3536e6a05f6cc924de452bbd13da Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Nov 2019 01:38:14 -0700 Subject: [PATCH 13/16] add functionality for ancestry for extract_list --- src/xml_parse_to_csv.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py index 1fed519..d501716 100644 --- a/src/xml_parse_to_csv.py +++ b/src/xml_parse_to_csv.py @@ -68,11 +68,15 @@ def extract_list(element, list_tag, list_element_tag): :param list_element_tag: :return: Extracted element as a list """ - extract_elements = 
find_all_rec(element, list_tag) - validate_extract_element(extract_elements) # validate exists and non-duplicate + ancestry = list_tag.split('.') + current = element + for sub_tag in ancestry: + extract_elements = find_all_rec(current, sub_tag) + validate_extract_element(extract_elements) # validate exists and non-duplicate + current = extract_elements[0] # failure: element is not list type - possible_list = extract_elements[0] + possible_list = current return_list = list(possible_list) for ele in return_list: # validate that list element is basic type and ele tag matches From 0487f53a00826cdb12d496bc492afc8f41cb3150 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Nov 2019 02:19:34 -0700 Subject: [PATCH 14/16] add default missing element return to be a blank string for extract list and basic --- src/xml_parse_to_csv.py | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py index d501716..bab9763 100644 --- a/src/xml_parse_to_csv.py +++ b/src/xml_parse_to_csv.py @@ -46,7 +46,12 @@ def extract_basic(element, tag): current = element for sub_tag in ancestry: extract_elements = find_all_rec(current, sub_tag) - validate_extract_element(extract_elements) # validate exists and non-duplicate + try: + validate_extract_element(extract_elements) # validate exists and non-duplicate + except MissingElement: # default missing element behavior return blank element + blank = ET.Element("Blank") + blank.text = "" + return blank current = extract_elements[0] # failure: element is not basic type @@ -72,7 +77,12 @@ def extract_list(element, list_tag, list_element_tag): current = element for sub_tag in ancestry: extract_elements = find_all_rec(current, sub_tag) - validate_extract_element(extract_elements) # validate exists and non-duplicate + try: + validate_extract_element(extract_elements) # validate exists and non-duplicate + except MissingElement: # default missing element behavior return blank element 
+ blank = ET.Element("Blank") + blank.text = "" + return blank current = extract_elements[0] # failure: element is not list type @@ -104,7 +114,7 @@ def validate_extract_element(found_elements): if len(found_elements) > 1: raise AmbiguousElement("{}".format(found_elements[0].tag)) - +# TODO: missing element defaults to return empty string # TODO: make the assumption that one row element will not be inside another row element # this means that in this problem, a Listing element will not be inside another Listing element # This needs to be more robust... What happens if there are nested elements and we want to get information about both of From adeee61b70866f1f5fea1abc6b251c74a676897b Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Nov 2019 02:20:22 -0700 Subject: [PATCH 15/16] make csv from xml according to project requirements --- listing_xml_to_csv.py | 64 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) create mode 100644 listing_xml_to_csv.py diff --git a/listing_xml_to_csv.py b/listing_xml_to_csv.py new file mode 100644 index 0000000..71e48b6 --- /dev/null +++ b/listing_xml_to_csv.py @@ -0,0 +1,64 @@ +from src.xml_parse_to_csv import read_xml_to_tree, bulk_extract, stringify_bulk_extract +import pandas as pd +import datetime +import re + +# get the xml tree +url = "http://syndication.enterprise.websiteidx.com/feeds/BoojCodeTest.xml" +tree = read_xml_to_tree(url) + +# bulk extract the columns that we want +row_tag = "Listing" + +basic_fields = [ + "MlsId", + "MlsName", + "DateListed", + "Location.StreetAddress", + "Price", + "Bedrooms", + "Bathrooms", + "BasicDetails.Description" +] + +list_fields = [ + { + "list_tag": "Appliances", + "list_element_tag": "Appliance" + }, + { + "list_tag": "Rooms", + "list_element_tag": "Room" + } +] + +row_dicts = bulk_extract( + tree.getroot(), + row_tag, + elements_basic=basic_fields, + elements_list=list_fields +).values() + +row_list = [stringify_bulk_extract(row) for row in row_dicts] + +df = 
pd.DataFrame(row_list) + +# date listed == 2016 +df["year"] = df["DateListed"].apply(datetime.datetime.strptime, args=('%Y-%m-%d %H:%M:%S',)) +df["year"] = df["year"].apply(lambda x: x.year) +df = df[df["year"] == 2016] + +print(df.columns) + +# filter description "and" +df["description_and"] = df["BasicDetails.Description"].apply(lambda x: bool(re.search(r".*and.*", x))) +df = df[df["description_and"]] + +# order by date listed +df.sort_values(by=["DateListed"], inplace=True) + +# reorder and get columns +order = basic_fields + [field["list_tag"] for field in list_fields] +df = df[order] + +df.to_csv('test.csv', index=False) From 29b1f9e8c58044905ee6c3252204489ec297aa10 Mon Sep 17 00:00:00 2001 From: Alex Date: Fri, 22 Nov 2019 13:12:40 -0700 Subject: [PATCH 16/16] final submission commit --- listing_xml_to_csv.py | 6 +-- requirements.txt | 5 +++ src/xml_parse_to_csv.py | 97 ++++++++++++++++++++++++++++++----------- test/test_xml_to_csv.py | 51 ++++++++++------------ 4 files changed, 101 insertions(+), 58 deletions(-) create mode 100644 requirements.txt diff --git a/listing_xml_to_csv.py b/listing_xml_to_csv.py index 71e48b6..eed4219 100644 --- a/listing_xml_to_csv.py +++ b/listing_xml_to_csv.py @@ -1,3 +1,4 @@ +# Alex Kennedy 11-22-19 from src.xml_parse_to_csv import read_xml_to_tree, bulk_extract, stringify_bulk_extract import pandas as pd import datetime @@ -10,6 +11,7 @@ # bulk extract the columns that we want row_tag = "Listing" +# Some fields use dot notation to represent ancestry of the element, used to resolve AmbiguousElement exceptions basic_fields = [ "MlsId", "MlsName", @@ -48,8 +50,6 @@ df["year"] = df["year"].apply(lambda x: x.year) df = df[df["year"] == 2016] -print(df.columns) - # filter description "and" df["description_and"] = df["BasicDetails.Description"].apply(lambda x: bool(re.search(r".*and.*", x))) df = df[df["description_and"]] @@ -61,4 +61,4 @@ order = basic_fields + [field["list_tag"] for field in list_fields] df = df[order] 
-df.to_csv('test.csv', index=False) +df.to_csv('zillow_data.csv', index=False) diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c45b133 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,5 @@ +numpy==1.16.5 +pandas==0.24.2 +python-dateutil==2.8.1 +pytz==2019.3 +six==1.13.0 diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py index bab9763..2901869 100644 --- a/src/xml_parse_to_csv.py +++ b/src/xml_parse_to_csv.py @@ -1,15 +1,24 @@ -# TODO: change to python 2.7 and run all test cases - +# Alex Kennedy 11-22-19 +# Library Imports import xml.etree.ElementTree as ET import urllib2 from copy import deepcopy +# My Imports from exceptions import AmbiguousElement, MissingElement, WrongElementType +# Out of Scope TODOs +# TODO: assumes that the client is connected to the internet. could add some functionality to check this +# TODO: change find_all_rec to generator func +# TODO: add variable behavior for default MissingElement() error handling, default blank string return +# TODO: default behavior turns a list into a string with elements sep by commas, create variable functionality? 
+# TODO: add functionality for other element types besides 'Basic' and 'List' + def read_xml_to_tree(url): """ Read XML from URL into ElementTree object + :param url: XML file url :return: ElementTree of XML object """ @@ -20,11 +29,11 @@ def read_xml_to_tree(url): def find_all_rec(element, tag): """ Recursively find all elements with tag within element + :param element: Element to search :param tag: Tag to search for :return: List of all sub-elements that have the tag """ - # TODO: could make this a generator matches = [] for child in list(element): if child.tag == tag: @@ -33,19 +42,20 @@ def find_all_rec(element, tag): return matches -# TODO: add functionality for other element types def extract_basic(element, tag): """ - Extract a basic type (text field, leaf node) sub-element from parent with given tag + Extract and validate a basic type (text field, leaf node) sub-element from parent with given tag :param element: Parent element to extract from :param tag: Tag of element to extract :return: Extracted element as a string """ + # parse and traverse ancestry ancestry = tag.split('.') current = element for sub_tag in ancestry: extract_elements = find_all_rec(current, sub_tag) + # validate exists and non-duplicate for ancestor try: validate_extract_element(extract_elements) # validate exists and non-duplicate except MissingElement: # default missing element behavior return blank element @@ -65,14 +75,15 @@ def extract_basic(element, tag): def extract_list(element, list_tag, list_element_tag): """ - Extract a list type (internal node, all children basic type) sub-element from parent with given list_tag and - elements having list_element_tag + Extract and validate a list type (internal node, all children basic type) sub-element from + parent with given list_tag and elements having list_element_tag :param element: parent element :param list_tag: :param list_element_tag: :return: Extracted element as a list """ + # parse and traverse ancestry ancestry = 
list_tag.split('.') current = element for sub_tag in ancestry: @@ -87,7 +98,6 @@ def extract_list(element, list_tag, list_element_tag): # failure: element is not list type possible_list = current - return_list = list(possible_list) for ele in return_list: # validate that list element is basic type and ele tag matches if ele.tag != list_element_tag or list(ele): @@ -103,7 +113,8 @@ def validate_extract_element(found_elements): Element must exist Element must not have duplicate tag as another element Helper method for extract_element_... methods - :param found_elements: Preliminary element list found by extract_element_... + + :param found_elements: Preliminary element list found by extract_... :return: None """ # failure: element with tag doesn't exist @@ -114,33 +125,40 @@ def validate_extract_element(found_elements): if len(found_elements) > 1: raise AmbiguousElement("{}".format(found_elements[0].tag)) -# TODO: missing element defaults to return empty string -# TODO: make the assumption that one row element will not be inside another row element -# this means that in this problem, a Listing element will not be inside another Listing element -# This needs to be more robust... What happens if there are nested elements and we want to get information about both of -# them for a row? -# TODO: default behavior turns a list into a string with elements sep by commas. Could add functionality to make this variable later. -# TODO: refactor def bulk_extract(root, row_tag, elements_basic=[], elements_list=[]): """ - Creates a dictionary with keys that represent the parent element of a row and values that are the columns in that row - Preparation for loading into a pandas DF + Creates a dictionary (structure below) of a bulk extract from an XML tree, preparation for pd.DataFrame + Structure: { key (row_element): value ( { - key (sub-element tag): value (element text) + "basics": { + basic_tag: basic_element, + ... + }, + "lists": { + list_tag: [ + list_sub_element, + ... 
+ ], + ... + } } ) } - :param root: xml tree to load all this information from - :param row_tag: tag for the elements to extract from - :param elements_basic: list of tags for the basic elements to extract - :param elements_list: list of tags for the list elements to extract - :return: Dict prepared for DF load + + :param root: root of tree + :param row_tag: tag for the parent elements to extract from + :param elements_basic: basic element to extract tags + :param elements_list: list element to extract tags + :return: Dict prepared for DF load (structure above) """ + # get all row elements to parse row_elements = find_all_rec(root, row_tag) + + # construct the dictionary with basic and list elements df_dict = {} for element in row_elements: df_row_entry = { @@ -150,13 +168,40 @@ def bulk_extract(root, row_tag, elements_basic=[], elements_list=[]): for basic in elements_basic: df_row_entry["basics"][basic] = extract_basic(element, basic) for lists in elements_list: - df_row_entry["lists"][lists["list_tag"]] = extract_list(element, lists["list_tag"], lists["list_element_tag"]) + df_row_entry["lists"][lists["list_tag"]] = extract_list( + element, lists["list_tag"], lists["list_element_tag"] + ) df_dict[element] = df_row_entry + return df_dict -# TODO: docstring def stringify_bulk_extract(extract_row): + """ + Stringify a bulk extract row: + Remove parent element, + compress basics and lists dicts, + extract text from each element, + make list into ',' joined string + + Example: + Input + {row_element: {'basics': {'TITLE': basic_element}, + 'lists': {'ARTIST': []}}, + row_element: {'basics': {'TITLE': basic_element}, + 'lists': {'ARTIST': [list_sub_element]}}, + row_element: {'basics': {'TITLE': basic_element}, + 'lists': {'ARTIST': [list_sub_element, + list_sub_element]}}} + + Output + [{'ARTIST': '', 'TITLE': 'Empire Burlesque'}, + {'ARTIST': 'TEST', 'TITLE': 'Hide your heart'}, + {'ARTIST': 'TEST, TEST', 'TITLE': 'Greatest Hits'}] + + :param extract_row: row dict to extract from + :return: List of CSV rows + """ a_copy = deepcopy(extract_row) # extract text from basic elements diff --git
a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py index 5f1a677..fc9feef 100644 --- a/test/test_xml_to_csv.py +++ b/test/test_xml_to_csv.py @@ -1,20 +1,24 @@ -from pprint import pprint +# Alex Kennedy 11-22-19 +# Library Imports from unittest import TestCase from copy import deepcopy import xml.etree.ElementTree as ET + +# My Imports from src.exceptions import MissingElement, AmbiguousElement, WrongElementType import src.xml_parse_to_csv as xmltocsv +# Out of Scope TODOs +# TODO: add more layers of ancestry testing for extract_element +# TODO: expand bulk_extract asserts for further coverage +# TODO: flipped expected and actual for assertEquals + -# TODO: all of this assumes that the client is connected to the internet. Could add some functionality to check this -# TODO: on assert equals, i got the expected and actual backwards for most of these class TestXMLToCSV(TestCase): - # TODO: TearDown def setUp(self): # read the xml from url to tree url = 'https://www.w3schools.com/xml/cd_catalog.xml' self.tree = xmltocsv.read_xml_to_tree(url) - # print(ET.tostring(self.tree.getroot())) # create an instance of the tree with duplicate tags in tree for testing find all rec method self.tree_with_duplicate_tag = deepcopy(self.tree) @@ -30,7 +34,6 @@ def setUp(self): to_add = deepcopy(add_to) for child in add_to: child.append(to_add) - # print(ET.tostring(self.tree.getroot())) def test_read_xml_to_tree(self): root = self.tree.getroot() @@ -40,7 +43,7 @@ def test_read_xml_to_tree(self): self.assertEqual(root.tag, 'CATALOG') self.assertEqual(len(list(root)), 26) - # check that each of the children are parsing all of their children + # check that we parse nested elements for child in root: self.assertEqual(len(list(child)), 6) @@ -87,7 +90,8 @@ def test_bulk_extract(self): elements_list=[{"list_tag": "ARTIST", "list_element_tag": "SUBARTIST"}] ).values() actual = [xmltocsv.stringify_bulk_extract(row_dict) for row_dict in actual] - # TODO: comment and explain this + + # test that 
expected = actual but ignore order of the list expected = list(expected) # make a mutable copy expected_equals_actual = True try: @@ -100,22 +104,11 @@ def test_bulk_extract(self): self.assertTrue(expected_equals_actual) - # TODO: more error testing here # assert exception for arguments that aren't valid self.assertRaises(Exception, xmltocsv.bulk_extract, test_tree, "CD", basic=["TITLE"], list=["ARTIST"]) -# TODO: can refactor 'parent = find_all_rec(self.tree.getroot(), 'CD')[0]' into setup class TestExtractElement(TestCase): - # # Sauce: https://stackoverflow.com/questions/8672754/how-to-show-the-error-messages-caught-by-assertraises-in-unittest-in-python2-7 - # def assertRaisesWithMessage(self, msg, func, *args, **kwargs): - # try: - # func(*args, **kwargs) - # self.assertFail() - # except Exception as inst: - # self.assertEqual(inst.message, msg) - - # TODO: Tear Down def setUp(self): # create CD catalog with 1 CD for easier testing url = 'https://www.w3schools.com/xml/cd_catalog.xml' @@ -132,9 +125,11 @@ def setUp(self): for i in range(3): add_to.append(deepcopy(to_add)) - # TODO: add more layers of ancestry testing + # create the root element for testing functions + self.test_root = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] + def test_extract_element_success(self): - parent_1LA = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] + parent_1LA = self.test_root # build first test for 2 level ancestry realfake parent_2LA_a = ET.Element("Top") @@ -169,20 +164,18 @@ def test_extract_element_success(self): self.assertEqual([ele.text for ele in xmltocsv.extract_list(parent_1LA, 'ARTIST', 'SUBARTIST')], compare) def test_extract_element_fail_not_exists(self): - # Tests failure case of both extract element methods # Failure: Sub-element with tag does not exist within parent element - parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] + parent = self.test_root # Basic - self.assertRaises(MissingElement, xmltocsv.extract_basic, parent, 'NOT_THERE') + 
self.assertEqual(xmltocsv.extract_basic(parent, 'NOT_THERE').text, "") # List - self.assertRaises(MissingElement, xmltocsv.extract_list, parent, 'NOT_THERE', 'STILL_NOT_THERE') + self.assertEqual(xmltocsv.extract_list(parent, 'NOT_THERE', 'STILL_NOT_THERE').text, "") def test_extract_element_fail_duplicate_key(self): - # Tests failure case of both extract element methods # Failure: Multiple sub-elements found with tag within parent element - parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] + parent = self.test_root # Basic self.assertRaises(AmbiguousElement, xmltocsv.extract_basic, parent, 'SUBARTIST') @@ -191,7 +184,7 @@ def test_extract_element_fail_duplicate_key(self): self.assertRaises(AmbiguousElement, xmltocsv.extract_list, parent, 'SUBARTIST', 'SUBSUBARTIST') def test_extract_element_failure_wrong_type(self): - parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] + parent = self.test_root # Basic self.assertRaises(WrongElementType, xmltocsv.extract_basic, parent, 'ARTIST') @@ -200,7 +193,7 @@ def test_extract_element_failure_wrong_type(self): self.assertRaises(WrongElementType, xmltocsv.extract_list, self.tree.getroot(), 'CD', 'TRACK') def test_validate_extract_element(self): - parent = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0] + parent = self.test_root # Does not exist found_elements = xmltocsv.find_all_rec(parent, 'NOT_THERE')