-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathreason_codes.py
More file actions
77 lines (62 loc) · 2.4 KB
/
reason_codes.py
File metadata and controls
77 lines (62 loc) · 2.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# This Python file uses the following encoding: utf-8
# import libraries
import soup_engine
import profiler
import constants
import progress_bar
PAGE_URL = 'https://www.ibm.com/docs/en/db2-for-zos/13?topic=codes-db2-reason'
URL_BASE = "https://www.ibm.com/docs/en/SSEPEK_13.0.0/codes/"
# Program profiling. Save starting time
profiler.start()
# Get the main page of the reason codes parsed into a BeautifulSoup instance
main_page_soup = soup_engine.soup_from_url(PAGE_URL)
# Find all links to reason code groups
main_page_link_spans = main_page_soup.find_all(
'li', class_="ulchildlink")
# Array for each of the reason code pages
reason_code_urls = []
# Progress variables
INDEX = 1
total_length = len(main_page_link_spans)
print('Scrape Reason code urls')
# Loop through main page reason code groups
for span in main_page_link_spans:
a_tag = span.find("a")
group_url = a_tag['href'].replace("../../", "")
# Load group page and get soup
child_page_soup = soup_engine.soup_from_url(
URL_BASE + group_url)
# Get the links to the individual code pages
child_page_link_spans = child_page_soup.find_all(
'li', class_="ulchildlink")
for span in child_page_link_spans:
a_tag = span.find("a")
reason_code_url = a_tag['href'].replace("../../", "")
url = URL_BASE + reason_code_url
reason_code_urls.append(url)
progress_bar.print(INDEX, total_length, 'Progress',
'(' + str(INDEX) + '/' + str(total_length) + ') URLs Scraped')
INDEX += 1
# Array to store the reason code dictionaries once they have been scraped
reason_code_dictionaries = []
INDEX = 1
TOTAL_LENGTH = len(reason_code_urls)
print('Scrape Pages')
# Scrape the reason code pages=
for url in reason_code_urls:
# Load group page and get soup
soup = soup_engine.soup_from_url(url)
# Parse code page
parsed_tags = soup_engine.dict_from_soup(soup, constants.PARSE_DICT)
parsed_tags[constants.DICT_URL] = url
# Append to array
reason_code_dictionaries.append(parsed_tags)
progress_bar.print(INDEX, TOTAL_LENGTH, 'Progress',
'(' + str(INDEX) + '/' + str(TOTAL_LENGTH) + ') Pages Scraped')
INDEX += 1
# Write to file
FILE_NAME = 'reason_codes'
print('Writing parsed codes to file with name ' + FILE_NAME)
soup_engine.write_soup_engine_dict_to_files(
reason_code_dictionaries, constants.PARSE_DICT, FILE_NAME)
profiler.end()