diff --git a/.gitignore b/.gitignore
index db1f598..259c6f0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -14,7 +14,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/
diff --git a/listing_xml_to_csv.py b/listing_xml_to_csv.py
new file mode 100644
index 0000000..eed4219
--- /dev/null
+++ b/listing_xml_to_csv.py
@@ -0,0 +1,61 @@
+# Alex Kennedy 11-22-19
+from src.xml_parse_to_csv import read_xml_to_tree, bulk_extract, stringify_bulk_extract
+import pandas as pd
+
+# get the xml tree
+url = "http://syndication.enterprise.websiteidx.com/feeds/BoojCodeTest.xml"
+tree = read_xml_to_tree(url)
+
+# bulk extract the columns that we want
+row_tag = "Listing"
+
+# Some fields use dot notation to represent ancestry of the element, used to resolve AmbiguousElement exceptions
+basic_fields = [
+    "MlsId",
+    "MlsName",
+    "DateListed",
+    "Location.StreetAddress",
+    "Price",
+    "Bedrooms",
+    "Bathrooms",
+    "BasicDetails.Description"
+]
+
+list_fields = [
+    {
+        "list_tag": "Appliances",
+        "list_element_tag": "Appliance"
+    },
+    {
+        "list_tag": "Rooms",
+        "list_element_tag": "Room"
+    }
+]
+
+row_dicts = bulk_extract(
+    tree.getroot(),
+    row_tag,
+    elements_basic=basic_fields,
+    elements_list=list_fields
+).values()
+
+row_list = [stringify_bulk_extract(row) for row in row_dicts]
+
+df = pd.DataFrame(row_list)
+
+# date listed == 2016; a missing or malformed date becomes NaT and is filtered out instead of aborting the run
+date_listed = pd.to_datetime(df["DateListed"], format='%Y-%m-%d %H:%M:%S', errors="coerce")
+df = df[date_listed.dt.year == 2016]
+
+# filter description "and" (plain substring match); a missing description counts as no match
+df = df[df["BasicDetails.Description"].str.contains("and", regex=False, na=False)]
+
+# order by date listed (the timestamp strings sort chronologically)
+df = df.sort_values(by=["DateListed"])
+
+# reorder and get columns
+order = basic_fields + [field["list_tag"] for field in list_fields]
+df = df[order]
+
+# explicit encoding so non-ASCII listing text does not depend on the interpreter default
+df.to_csv('zillow_data.csv', index=False, encoding='utf-8')
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..c45b133
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,5 @@
+numpy==1.16.5
+pandas==0.24.2
+python-dateutil==2.8.1
+pytz==2019.3
+six==1.13.0
diff --git a/src/__init__.py b/src/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/src/exceptions.py b/src/exceptions.py
new file mode 100644
index 0000000..ce60186
--- /dev/null
+++ b/src/exceptions.py
@@ -0,0 +1,10 @@
+class AmbiguousElement(Exception):
+    """More than one element matched a tag that has to be unique"""
+
+
+class MissingElement(Exception):
+    """No element matched the requested tag"""
+
+
+class WrongElementType(Exception):
+    """The matched element is not of the expected 'basic' or 'List' type"""
diff --git a/src/xml_parse_to_csv.py b/src/xml_parse_to_csv.py
new file mode 100644
index 0000000..2901869
--- /dev/null
+++ b/src/xml_parse_to_csv.py
@@ -0,0 +1,227 @@
+# Alex Kennedy 11-22-19
+# Library Imports
+import xml.etree.ElementTree as ET
+import urllib2
+from copy import deepcopy
+
+# My Imports
+from exceptions import AmbiguousElement, MissingElement, WrongElementType
+
+# Out of Scope TODOs
+# TODO: assumes that the client is connected to the internet. could add some functionality to check this
+# TODO: change find_all_rec to generator func
+# TODO: add variable behavior for default MissingElement() error handling, default blank string return
+# TODO: default behavior turns a list into a string with elements sep by commas, create variable functionality?
+# TODO: add functionality for other element types besides 'Basic' and 'List'
+
+
+def read_xml_to_tree(url):
+    """
+    Read XML from URL into ElementTree object
+
+    :param url: XML file url
+    :return: ElementTree of XML object
+    """
+    f_xml = urllib2.urlopen(url)
+    try:
+        return ET.parse(f_xml)
+    finally:
+        f_xml.close()  # release the connection even when parsing fails
+
+
+def find_all_rec(element, tag):
+    """
+    Find all elements with tag within element (at any depth, in document order)
+
+    :param element: Element to search; it is never part of the result itself
+    :param tag: Tag to search for
+    :return: List of all sub-elements that have the tag
+    """
+    # Element.iter() walks the whole subtree but starts at element itself, which is excluded here
+    return [found for found in element.iter(tag) if found is not element]
+
+
+def _resolve_ancestry(element, dotted_tag):
+    """
+    Follow a dot-separated tag path below element, helper for the extract_... methods
+
+    :param element: Element to start the search from
+    :param dotted_tag: Tag to find, optionally prefixed by ancestor tags ("Ancestor.Tag")
+    :return: The single element the path leads to, or None if any step is missing
+    """
+    current = element
+    for sub_tag in dotted_tag.split('.'):
+        extract_elements = find_all_rec(current, sub_tag)
+        try:
+            validate_extract_element(extract_elements)  # validate exists and non-duplicate
+        except MissingElement:
+            return None
+        current = extract_elements[0]
+    return current
+
+
+def _blank_element():
+    """
+    Build the default stand-in for a missing element
+
+    :return: Childless "Blank" element whose text is an empty string
+    """
+    blank = ET.Element("Blank")
+    blank.text = ""
+    return blank
+
+
+def extract_basic(element, tag):
+    """
+    Extract and validate a basic type (text field, leaf node) sub-element from parent with given tag
+
+    :param element: Parent element to extract from
+    :param tag: Tag of element to extract, dot notation ("Ancestor.Tag") resolves ambiguity
+    :return: Extracted element, or a blank element (text "") if it does not exist
+    :raises AmbiguousElement: if any step of the tag path matches several elements
+    :raises WrongElementType: if the extracted element has children
+    """
+    return_element = _resolve_ancestry(element, tag)
+    # must compare with None: an Element without children is falsy
+    if return_element is None:  # default missing element behavior return blank element
+        return _blank_element()
+
+    # failure: element is not basic type
+    if list(return_element):  # if element has children, then not basic type
+        raise WrongElementType("Expected 'basic'.")
+
+    # success!
+    return return_element
+
+
+def extract_list(element, list_tag, list_element_tag):
+    """
+    Extract and validate a list type (internal node, all children basic type) sub-element from
+    parent with given list_tag and elements having list_element_tag
+
+    :param element: parent element
+    :param list_tag: Tag of the list element, dot notation ("Ancestor.Tag") resolves ambiguity
+    :param list_element_tag: Tag that every child of the list element must have
+    :return: Children of the list element as a list, or a blank element (text "", no
+        children) if the list element does not exist
+    :raises AmbiguousElement: if any step of the tag path matches several elements
+    :raises WrongElementType: if a child has another tag or has children of its own
+    """
+    possible_list = _resolve_ancestry(element, list_tag)
+    # must compare with None: an Element without children is falsy
+    if possible_list is None:  # default missing element behavior return blank element
+        return _blank_element()
+
+    # failure: element is not list type
+    return_list = list(possible_list)
+    for ele in return_list:  # validate that list element is basic type and ele tag matches
+        if ele.tag != list_element_tag or list(ele):
+            raise WrongElementType("Expected 'List'")
+
+    # success!
+    return return_list
+
+
+def validate_extract_element(found_elements):
+    """
+    Validate conditions for extract element:
+        Element must exist
+        Element must not have duplicate tag as another element
+    Helper method for extract_... methods
+
+    :param found_elements: Preliminary element list found by extract_...
+    :return: None
+    :raises MissingElement: if found_elements is empty
+    :raises AmbiguousElement: if found_elements has more than one element
+    """
+    # failure: element with tag doesn't exist
+    if not found_elements:
+        raise MissingElement()
+
+    # failure: multiple elements with same tag
+    if len(found_elements) > 1:
+        raise AmbiguousElement("{}".format(found_elements[0].tag))
+
+
+def bulk_extract(root, row_tag, elements_basic=None, elements_list=None):
+    """
+    Creates a dictionary (structure below) of a bulk extract from an XML tree, preparation for pd.DataFrame
+
+    Structure:
+    {
+        key (row_element): value (
+            {
+                "basics": {
+                    basic_tag: basic_element,
+                    ...
+                },
+                "lists": {
+                    list_tag: [
+                        list_sub_element,
+                        ...
+                    ],
+                    ...
+                }
+            }
+        )
+    }
+
+    :param root: root of tree
+    :param row_tag: tag for the parent elements to extract from
+    :param elements_basic: basic element to extract tags, defaults to none
+    :param elements_list: list element to extract tags (dicts with "list_tag" and
+        "list_element_tag" keys), defaults to none
+    :return: Dict prepared for DF load (structure above)
+    """
+    # None instead of [] as default: a default list object would be shared between calls
+    elements_basic = elements_basic or []
+    elements_list = elements_list or []
+
+    # construct the dictionary with basic and list elements
+    df_dict = {}
+    for element in find_all_rec(root, row_tag):
+        df_row_entry = {
+            "basics": {},
+            "lists": {}
+        }
+        for basic in elements_basic:
+            df_row_entry["basics"][basic] = extract_basic(element, basic)
+        for lists in elements_list:
+            df_row_entry["lists"][lists["list_tag"]] = extract_list(
+                element, lists["list_tag"], lists["list_element_tag"]
+            )
+        df_dict[element] = df_row_entry
+
+    return df_dict
+
+
+def stringify_bulk_extract(extract_row):
+    """
+    Stringify a single bulk extract row (one value of the bulk_extract dict):
+        compress basics and lists dicts into one flat dict,
+        extract text from each element,
+        make list into ', ' joined string
+
+    Example:
+        Input
+            {'basics': {'TITLE': title_element},
+             'lists': {'ARTIST': [subartist_element, subartist_element]}}
+
+        Output
+            {'ARTIST': 'TEST, TEST', 'TITLE': 'Greatest Hits'}
+
+    :param extract_row: row dict to extract from, it is left unmodified
+    :return: Flat dict of tag to text, one CSV row
+    """
+    compressed = {}
+
+    # extract text from basic elements
+    for tag, basic_element in extract_row["basics"].items():
+        compressed[tag] = basic_element.text
+
+    # extract text from list elements and join with comma according to project requirements
+    for tag, list_elements in extract_row["lists"].items():
+        # a list element without text has text None, which str.join would reject
+        compressed[tag] = ", ".join(ele.text or "" for ele in list_elements)
+
+    return compressed
diff --git a/test/test_xml_to_csv.py b/test/test_xml_to_csv.py
new file mode 100644
index 0000000..fc9feef
--- /dev/null
+++ b/test/test_xml_to_csv.py
@@ -0,0 +1,204 @@
+# Alex Kennedy 11-22-19
+# Library Imports
+from unittest import TestCase
+from copy import deepcopy
+import xml.etree.ElementTree as ET
+
+# My Imports
+from src.exceptions import MissingElement, AmbiguousElement, WrongElementType
+import src.xml_parse_to_csv as xmltocsv
+
+# Out of Scope TODOs
+# TODO: add more layers of ancestry testing for extract_element
+# TODO: expand bulk_extract asserts for further coverage
+# TODO: flipped expected and actual for assertEquals
+
+
+class TestXMLToCSV(TestCase):
+    def setUp(self):
+        # read the xml from url to tree
+        url = 'https://www.w3schools.com/xml/cd_catalog.xml'
+        self.tree = xmltocsv.read_xml_to_tree(url)
+
+        # create an instance of the tree with duplicate tags in tree for testing find all rec method
+        self.tree_with_duplicate_tag = deepcopy(self.tree)
+        root = self.tree_with_duplicate_tag.getroot()
+
+        # delete all children except the first one
+        children = list(root)
+        for i in range(1, len(children)):
+            root.remove(children[i])
+
+        # add a copy of the only child of tree to each grandchild
+        add_to = list(root)[0]
+        to_add = deepcopy(add_to)
+        for child in add_to:
+            child.append(to_add)
+
+    def test_read_xml_to_tree(self):
+        root = self.tree.getroot()
+
+        # basic xml tree structure inserts
+        self.assertIsInstance(self.tree, ET.ElementTree)
+        self.assertEqual(root.tag, 'CATALOG')
+        self.assertEqual(len(list(root)), 26)
+
+        # check that we parse nested elements
+        for child in root:
+            self.assertEqual(len(list(child)), 6)
+
+    def test_find_all_rec(self):
+        self.assertEqual(len(xmltocsv.find_all_rec(self.tree.getroot(), 'TITLE')), 26)
+        self.assertEqual(len(xmltocsv.find_all_rec(self.tree_with_duplicate_tag.getroot(), "CD")), 7)
+
+    def test_bulk_extract(self):
+        test_tree = deepcopy(self.tree)
+        root = test_tree.getroot()
+        children = list(root)
+
+        # add a list element to the first three to test that functionality
+        for child_i in range(3):
+            to_add = ET.Element('SUBARTIST')
+            to_add.text = 'TEST'
+            add_to = children[child_i].find('ARTIST')
+            for _ in range(child_i):
+                add_to.append(deepcopy(to_add))
+
+        # truncate to 3 children
+        for i in range(3, len(children)):
+            root.remove(children[i])
+
+        expected = [
+            {
+                "TITLE": "Empire Burlesque",
+                "ARTIST": ""
+            },
+            {
+                "TITLE": "Hide your heart",
+                "ARTIST": "TEST"
+            },
+            {
+                "TITLE": "Greatest Hits",
+                "ARTIST": "TEST, TEST"
+            }
+        ]
+
+        actual = xmltocsv.bulk_extract(
+            root,
+            "CD",
+            elements_basic=["TITLE"],
+            elements_list=[{"list_tag": "ARTIST", "list_element_tag": "SUBARTIST"}]
+        ).values()
+        actual = [xmltocsv.stringify_bulk_extract(row_dict) for row_dict in actual]
+
+        # test that expected = actual but ignore order of the list
+        expected = list(expected)  # make a mutable copy
+        expected_equals_actual = True
+        try:
+            for elem in actual:
+                expected.remove(elem)
+        except ValueError:
+            expected_equals_actual = False
+        if expected:
+            expected_equals_actual = False
+
+        self.assertTrue(expected_equals_actual)
+
+        # assert exception for arguments that aren't valid
+        self.assertRaises(Exception, xmltocsv.bulk_extract, test_tree, "CD", basic=["TITLE"], list=["ARTIST"])
+
+
+class TestExtractElement(TestCase):
+    def setUp(self):
+        # create CD catalog with 1 CD for easier testing
+        url = 'https://www.w3schools.com/xml/cd_catalog.xml'
+        self.tree = xmltocsv.read_xml_to_tree(url)
+        root = self.tree.getroot()
+        children = list(root)
+        for i in range(1, len(children)):
+            root.remove(children[i])
+
+        # create a list element for testing
+        to_add = ET.Element('SUBARTIST')
+        to_add.text = 'TEST'
+        add_to = xmltocsv.find_all_rec(root, 'ARTIST')[0]
+        for i in range(3):
+            add_to.append(deepcopy(to_add))
+
+        # create the root element for testing functions
+        self.test_root = xmltocsv.find_all_rec(self.tree.getroot(), 'CD')[0]
+
+    def test_extract_element_success(self):
+        parent_1LA = self.test_root
+
+        # build first test for 2 level ancestry realfake
+        parent_2LA_a = ET.Element("Top")
+        ET.SubElement(parent_2LA_a, "Mid")
+        bot_real = ET.Element("Bot")
+        bot_real.text = "real"
+        parent_2LA_a.find("Mid").append(bot_real)
+        bot_fake = ET.Element("Bot")
+        bot_fake.text = "decoy"
+        parent_2LA_a.append(bot_fake)
+
+        # build second test for 2 level ancestry realfake
+        parent_2LA_b = ET.Element("Top")
+        ET.SubElement(parent_2LA_b, "Mid")
+        ET.SubElement(parent_2LA_b, "Mid2")
+        bot_real = ET.Element("Bot")
+        bot_real.text = "real"
+        parent_2LA_b.find("Mid").append(bot_real)
+        bot_fake = ET.Element("Bot")
+        bot_fake.text = "decoy"
+        parent_2LA_b.find("Mid2").append(bot_fake)
+
+        # Basic
+        # 1 level ancestry
+        self.assertEqual(xmltocsv.extract_basic(parent_1LA, 'TITLE').text, 'Empire Burlesque')
+        # 2 level ancestry
+        self.assertEqual(xmltocsv.extract_basic(parent_2LA_a, 'Mid.Bot').text, 'real')
+        self.assertEqual(xmltocsv.extract_basic(parent_2LA_b, 'Mid2.Bot').text, 'decoy')
+
+        # List
+        compare = ['TEST', 'TEST', 'TEST']
+        self.assertEqual([ele.text for ele in xmltocsv.extract_list(parent_1LA, 'ARTIST', 'SUBARTIST')], compare)
+
+    def test_extract_element_fail_not_exists(self):
+        # Failure: Sub-element with tag does not exist within parent element
+        parent = self.test_root
+
+        # Basic
+        self.assertEqual(xmltocsv.extract_basic(parent, 'NOT_THERE').text, "")
+
+        # List
+        self.assertEqual(xmltocsv.extract_list(parent, 'NOT_THERE', 'STILL_NOT_THERE').text, "")
+
+    def test_extract_element_fail_duplicate_key(self):
+        # Failure: Multiple sub-elements found with tag within parent element
+        parent = self.test_root
+
+        # Basic
+        self.assertRaises(AmbiguousElement, xmltocsv.extract_basic, parent, 'SUBARTIST')
+
+        # List
+        self.assertRaises(AmbiguousElement, xmltocsv.extract_list, parent, 'SUBARTIST', 'SUBSUBARTIST')
+
+    def test_extract_element_failure_wrong_type(self):
+        parent = self.test_root
+
+        # Basic
+        self.assertRaises(WrongElementType, xmltocsv.extract_basic, parent, 'ARTIST')
+
+        # List
+        self.assertRaises(WrongElementType, xmltocsv.extract_list, self.tree.getroot(), 'CD', 'TRACK')
+
+    def test_validate_extract_element(self):
+        parent = self.test_root
+
+        # Does not exist
+        found_elements = xmltocsv.find_all_rec(parent, 'NOT_THERE')
+        self.assertRaises(MissingElement, xmltocsv.validate_extract_element, found_elements)
+
+        # Duplicate tag
+        found_elements = xmltocsv.find_all_rec(parent, 'SUBARTIST')
+        self.assertRaises(AmbiguousElement, xmltocsv.validate_extract_element, found_elements)