From 6defbf65620f4185591d5f729c135073e2b74cb9 Mon Sep 17 00:00:00 2001 From: Bradley Irving <30394346+beirving@users.noreply.github.com> Date: Mon, 25 Nov 2019 15:31:01 -0700 Subject: [PATCH] File commit Code challenge on forked repo --- .gitattributes | 2 + .gitignore | 8 + .idea/.gitignore | 3 + .idea/booj-code-challenge.iml | 14 ++ .../inspectionProfiles/profiles_settings.xml | 6 + .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/vcs.xml | 6 + LICENSE | 24 +++ main.py | 8 + parseXmlSaveCsv.py | 169 ++++++++++++++++++ requirements.txt | 19 ++ test_parseXmlSaveCsv.py | 123 +++++++++++++ 13 files changed, 394 insertions(+) create mode 100644 .gitattributes create mode 100644 .idea/.gitignore create mode 100644 .idea/booj-code-challenge.iml create mode 100644 .idea/inspectionProfiles/profiles_settings.xml create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/vcs.xml create mode 100644 LICENSE create mode 100644 main.py create mode 100644 parseXmlSaveCsv.py create mode 100644 requirements.txt create mode 100644 test_parseXmlSaveCsv.py diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..dfe0770 --- /dev/null +++ b/.gitattributes @@ -0,0 +1,2 @@ +# Auto detect text files and perform LF normalization +* text=auto diff --git a/.gitignore b/.gitignore index db1f598..b385c4b 100644 --- a/.gitignore +++ b/.gitignore @@ -98,3 +98,11 @@ ENV/ # mypy .mypy_cache/ +venv/.
+ +.pytest_cache + +downloaded_xml_data.xml +output.csv + + diff --git a/.idea/.gitignore b/.idea/.gitignore new file mode 100644 index 0000000..0e40fe8 --- /dev/null +++ b/.idea/.gitignore @@ -0,0 +1,3 @@ + +# Default ignored files +/workspace.xml \ No newline at end of file diff --git a/.idea/booj-code-challenge.iml b/.idea/booj-code-challenge.iml new file mode 100644 index 0000000..b6c198f --- /dev/null +++ b/.idea/booj-code-challenge.iml @@ -0,0 +1,14 @@ + + + + + + + + + + + + \ No newline at end of file diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml new file mode 100644 index 0000000..105ce2d --- /dev/null +++ b/.idea/inspectionProfiles/profiles_settings.xml @@ -0,0 +1,6 @@ + + + + \ No newline at end of file diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..ae8e745 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..683be98 --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..6bb8a29 --- /dev/null +++ b/LICENSE @@ -0,0 +1,24 @@ +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means. + +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. 
"""Entry point: download the challenge XML feed and export it as CSV."""
import parseXmlSaveCsv

# Target XML feed for the code challenge.
challenge_url = 'http://syndication.enterprise.websiteidx.com/feeds/BoojCodeTest.xml'


def main(url=challenge_url):
    """
    Download the XML feed and convert the matching listings to CSV
    :param url: str
        Feed location, defaults to the challenge feed
    :return: void
    """
    # download xml feed to local file
    local_file = parseXmlSaveCsv.download_file(url)
    # parse xml and save as csv
    parseXmlSaveCsv.parse_and_save(local_file)


# Guarded so that importing this module (e.g. from a test or a REPL) does not
# trigger a network download and a CSV write as a side effect.
if __name__ == '__main__':
    main()
"""Download a listings XML feed, filter the listings and export them to CSV."""
import collections
import contextlib
import csv
import datetime
import os
import sys
import xml.etree.ElementTree as eTree

try:
    import urllib2  # Python 2
except ImportError:
    # Python 3 moved urlopen to urllib.request. The historical module name is
    # kept so code that patches ``parseXmlSaveCsv.urllib2.urlopen`` still works.
    import urllib.request as urllib2

_PY3 = sys.version_info[0] >= 3
# Timestamp layout used by the feed's DateListed node.
_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'


def download_file(url, file_name='downloaded_xml_data.xml', timeout=60):
    """
    Download a file from an external source and save locally
    :param url: str
        Valid url for XML
    :param file_name: str
        Set location for file to be downloaded
        Default CWD/downloaded_xml_data.xml
    :param timeout: int or float
        Seconds to wait on the connection before giving up
    :return: str : file location
    """
    # closing() releases the connection even when read() raises.
    with contextlib.closing(urllib2.urlopen(url, timeout=timeout)) as source:
        contents = source.read()
    # Binary mode stores the payload byte-for-byte (no newline translation)
    # and is the only mode that accepts the bytes returned on Python 3.
    with open(file_name, 'wb') as file_handler:
        file_handler.write(contents)
    return os.path.realpath(file_name)


def check_valid_year(check_date, target_year=2016):
    """
    check if the date given is valid for the target year
    :param check_date: str
        Timestamp formatted as 'YYYY-MM-DD HH:MM:SS'; None or blank is
        treated as "not in the target year"
    :param target_year: int (numeric strings are accepted)
    :return: bool
    :raises ValueError: if check_date is present but not in the expected format
    """
    if check_date is None or not check_date.strip():
        return False
    date = datetime.datetime.strptime(check_date.strip(), _DATE_FORMAT)
    return date.year == int(target_year)


def check_description(description, check_term=' and '):
    """
    Check if the first argument contains the 2nd argument
    :param description: str
        None (an empty xml node) never matches
    :param check_term: str
    :return: bool
    """
    if description is None:
        return False
    return check_term in description


def _node_text(parent, tag):
    """Return the text of the first ``tag`` child of ``parent``, or None if absent."""
    if parent is None:
        return None
    node = parent.find(tag)
    if node is None:
        return None
    return node.text


def listing_valid(listing_iterator_item):
    """
    Helper function to clean up parse_file function
    Runs check_valid_year and check_description under default params
    A listing that lacks the date or description node is reported as invalid
    :param listing_iterator_item: xml.etree.ElementTree.Element
    :return: bool
    """
    listing_details = listing_iterator_item.find('ListingDetails')
    if not check_valid_year(_node_text(listing_details, 'DateListed')):
        return False
    basic_details = listing_iterator_item.find('BasicDetails')
    return check_description(_node_text(basic_details, 'Description'))


def join_sub_nodes(main_node, term):
    """
    Outputs a comma separated string of the "term" node values
    Sub nodes without text are skipped
    :param main_node: xml.etree.ElementTree.Element
    :param term: str
    :return: str
    """
    if main_node is None:
        return ''
    node_items = main_node.findall(term) or []
    return ','.join(
        node_item.text for node_item in node_items
        if node_item.text is not None
    )


def get_get_fields(listing_iterator_item):
    """
    Collect desired node values from xml
    Missing or empty nodes yield None (written to the csv as an empty cell)
    :param listing_iterator_item: xml.etree.ElementTree.Element
    :return: collections.OrderedDict
    """
    listing_details = listing_iterator_item.find('ListingDetails')
    location_details = listing_iterator_item.find('Location')
    basic_details = listing_iterator_item.find('BasicDetails')
    rich_details = listing_iterator_item.find('RichDetails')

    output = collections.OrderedDict()
    output['MlsId'] = _node_text(listing_details, 'MlsId')
    output['MlsName'] = _node_text(listing_details, 'MlsName')
    output['DateListed'] = _node_text(listing_details, 'DateListed')
    output['StreetAddress'] = _node_text(location_details, 'StreetAddress')
    output['Price'] = _node_text(listing_details, 'Price')
    output['Bedrooms'] = _node_text(basic_details, 'Bedrooms')

    # NOTE: the Bathrooms node produces no values for this feed. Summing the
    # FullBathrooms / HalfBathrooms / ThreeQuarterBathrooms nodes is a possible
    # replacement, but the feed contains implausible counts there (one listing
    # carries 99), so which nodes to trust needs stakeholder clarification.
    output['Bathrooms'] = _node_text(basic_details, 'Bathrooms')

    if rich_details is None:
        appliances = rooms = None
    else:
        appliances = rich_details.find('Appliances')
        rooms = rich_details.find('Rooms')
    output['Appliances'] = join_sub_nodes(appliances, 'Appliance')
    # The key used to be 'Rooms ' (trailing space), which leaked into the
    # csv header.
    output['Rooms'] = join_sub_nodes(rooms, 'Room')

    # truncate to the 200th character
    description = _node_text(basic_details, 'Description') or ''
    output['Description'] = description[0:200]
    return output


def _csv_cell(value):
    """Encode Python 2 unicode text as UTF-8 bytes; pass everything else through."""
    if not _PY3 and isinstance(value, unicode):  # noqa: F821 (Python 2 only)
        return value.encode('utf-8')
    return value


def _open_csv_for_append(file_location):
    """Open ``file_location`` for appending the way the csv module expects."""
    if _PY3:
        # newline='' lets the csv module control line endings itself.
        return open(file_location, 'a', newline='', encoding='utf-8')
    return open(file_location, 'ab')


def write_listing_to_csv(listing_order_dict, file_name='output'):
    """
    Write the given dict data to the given csv
    The header row is written only when the file is new or still empty
    :param listing_order_dict: collections.OrderedDict
    :param file_name: str
        Output path without the ".csv" extension
    :return: void
    """
    file_location = file_name + ".csv"
    write_header = (
        not os.path.isfile(file_location)
        or os.path.getsize(file_location) == 0
    )

    with _open_csv_for_append(file_location) as f:
        writer = csv.writer(f)
        if write_header:
            writer.writerow([_csv_cell(key) for key in listing_order_dict])
        writer.writerow(
            [_csv_cell(value) for value in listing_order_dict.values()]
        )


def parse_and_save(local_file_location, file_name='output'):
    """
    Parse the given local xml file and append every valid listing to a csv
    Listings are handled once their enclosing 'Listings' node has been read
    completely, so the whole listing collection is held in memory at that point
    NOTE: xml.etree is not hardened against maliciously constructed xml; the
    feed is assumed to come from a trusted source
    :param local_file_location: str
    :param file_name: str
        Output path without the ".csv" extension, default CWD/output
    :return: void
    """
    for event, element in eTree.iterparse(local_file_location):
        if event == "end" and element.tag == 'Listings':
            for listing in element:
                if listing_valid(listing):
                    write_listing_to_csv(get_get_fields(listing), file_name)
            # Release the processed subtree.
            element.clear()
"""Unit tests for the parseXmlSaveCsv module."""
import collections
import os
# ``import xml`` alone does not load the etree submodule; import it explicitly
# so the fixture does not depend on another module having loaded it first.
import xml.etree.ElementTree

import pytest
from mock import Mock, patch

import parseXmlSaveCsv


@pytest.fixture(scope='function')
def mock_xml_element():
    """Element stand-in whose find() always yields a node with text 'string'."""
    mock_element = Mock(spec=xml.etree.ElementTree.Element)
    mock_text = Mock()
    mock_text.text = 'string'
    mock_element.find.return_value = mock_text
    # ``tag`` is a plain attribute, not a callable, so assign it directly.
    mock_element.tag = 'Listings'
    return mock_element


def _fake_response():
    """Build a urlopen() result whose read() returns a small payload."""
    response = Mock()
    # A bytes literal is what urlopen().read() yields on Python 3 and is a
    # plain str on Python 2.
    response.read.return_value = b'mock_contents1'
    return response


# The tmpdir fixture keeps generated files out of the working directory and
# builds paths with the platform's own separator.
@patch('parseXmlSaveCsv.urllib2.urlopen')
def test_download_file_will_pass(mock_urlopen, tmpdir):
    target = str(tmpdir.join('test.xml'))
    mock_urlopen.return_value = _fake_response()
    result = parseXmlSaveCsv.download_file('http://test.com', target)
    assert isinstance(result, str)
    assert result == os.path.realpath(target)
    assert os.path.isfile(result)


@patch('parseXmlSaveCsv.urllib2.urlopen')
def test_download_file_will_fail(mock_urlopen, tmpdir):
    target = str(tmpdir.join('test.xml'))
    other = str(tmpdir.join('test_fail.xml'))
    mock_urlopen.return_value = _fake_response()
    result = parseXmlSaveCsv.download_file('http://test.com', target)
    # Compare by value: an identity check between two distinct string objects
    # is always true and would never detect a wrong path.
    assert result != os.path.realpath(other)


def test_check_description_will_pass():
    result = parseXmlSaveCsv.check_description('this test will pass', 'pass')
    assert result


def test_check_description_will_fail():
    result = parseXmlSaveCsv.check_description('this test will pass', 'fail')
    assert result is False


def test_check_valid_year_will_pass():
    result = parseXmlSaveCsv.check_valid_year('2019-05-15 13:29:45', 2019)
    assert result


def test_check_valid_year_will_fail():
    result = parseXmlSaveCsv.check_valid_year('2019-05-15 13:29:45', 2018)
    assert result is False


# Stacked @patch decorators inject their mocks bottom-up: the decorator closest
# to the function supplies the FIRST mock argument.
@patch("parseXmlSaveCsv.check_valid_year", return_value=True)
@patch("parseXmlSaveCsv.check_description", return_value=True)
def test_listing_valid_will_pass(mock_check_description, mock_check_year, mock_xml_element):
    result = parseXmlSaveCsv.listing_valid(mock_xml_element)
    assert mock_check_year.called
    assert mock_check_description.called
    assert result


@patch("parseXmlSaveCsv.check_valid_year", return_value=False)
def test_listing_valid_will_fail_check_year(mock_check_year, mock_xml_element):
    result = parseXmlSaveCsv.listing_valid(mock_xml_element)
    assert mock_check_year.called
    assert result is False


@patch("parseXmlSaveCsv.check_valid_year", return_value=True)
@patch("parseXmlSaveCsv.check_description", return_value=False)
def test_listing_valid_will_fail_check_description(mock_check_description, mock_check_year, mock_xml_element):
    result = parseXmlSaveCsv.listing_valid(mock_xml_element)
    assert mock_check_year.called
    assert mock_check_description.called
    assert result is False


def test_join_sub_nodes_will_pass(mock_xml_element):
    mock_text = Mock()
    mock_text.text = 'string'

    mock_xml_element.findall.return_value = [mock_text, mock_text]

    result = parseXmlSaveCsv.join_sub_nodes(mock_xml_element, 'test')
    assert result == 'string,string'


def test_join_sub_nodes_will_fail_main_node_is_none():
    result = parseXmlSaveCsv.join_sub_nodes(None, 'test')
    assert result == ''


def test_join_sub_nodes_will_fail_main_node_find_all_is_none(mock_xml_element):
    mock_xml_element.findall.return_value = None
    result = parseXmlSaveCsv.join_sub_nodes(mock_xml_element, 'test')
    assert result == ''


@patch("parseXmlSaveCsv.join_sub_nodes", return_value='string')
def test_get_fields_will_pass(mock_join_sub_nodes, mock_xml_element):
    mock_element = Mock(spec=xml.etree.ElementTree.Element)
    mock_element.find.return_value = mock_xml_element
    result = parseXmlSaveCsv.get_get_fields(mock_element)
    assert mock_join_sub_nodes.called
    assert isinstance(result, collections.OrderedDict)


def test_write_listing_to_csv_will_pass(tmpdir):
    base_name = str(tmpdir.join('test'))
    mock_dict = collections.OrderedDict()
    mock_dict['header'] = 'value'
    parseXmlSaveCsv.write_listing_to_csv(mock_dict, base_name)
    assert os.path.isfile(base_name + '.csv')
    with open(base_name + '.csv', 'rb') as written:
        # csv.writer terminates rows with '\r\n' by default.
        assert written.read() == b'header\r\nvalue\r\n'