-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathsql_codes.py
More file actions
65 lines (48 loc) · 1.96 KB
/
sql_codes.py
File metadata and controls
65 lines (48 loc) · 1.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
# This Python file uses the following encoding: utf-8
# import libraries
from bs4 import BeautifulSoup
import soup_engine
import profile
import constants
import progress_bar
# Code taken from the tutorial here https://medium.freecodecamp.org/how-to-scrape-websites-with-python-and-beautifulsoup-5946935d93fe
# and here https://stanford.edu/~mgorkove/cgi-bin/rpython_tutorials/Scraping_a_Webpage_Rendered_by_Javascript_Using_Python.php
page_url = 'https://www.ibm.com/docs/en/db2-for-zos/13?topic=codes-error-sql'
url_base = "https://www.ibm.com/docs/en/SSEPEK_13.0.0/codes/"
############ PROGRAM ############
# Program profiling
profiler.start()
# Array to store the reason code dictionaries once they have been scraped
sql_code_dictionaries = []
#### START SCRAPE ####
print('Scraping: ' + page_url)
# Get the main page of the reason codes parsed into a BeautifulSoup instance
main_page_soup = soup_engine.soup_from_url(page_url)
# Find all links to reason code groups
main_page_link_spans = main_page_soup.find_all(
'li', class_="ulchildlink")
index = 1
total_length = len(main_page_link_spans)
for span in main_page_link_spans:
a_tag = span.find("a")
sql_code_url = a_tag['href'].replace("../../", "")
# Load group page and get soup
url = url_base + sql_code_url
soup = soup_engine.soup_from_url(url)
# Parse code page
parsed_tags = soup_engine.dict_from_soup(soup, constants.PARSE_DICT)
parsed_tags[constants.DICT_URL] = url
# Append to array
sql_code_dictionaries.append(parsed_tags)
progress_bar.print(index, total_length, 'Progress',
'(' + str(index) + '/' + str(total_length) + ') Pages Scraped')
index += 1
# Write to file
file_name = 'sql_codes'
print('Writing parsed codes to file with name ' + file_name)
soup_engine.write_soup_engine_dict_to_files(
sql_code_dictionaries, constants.PARSE_DICT, file_name)
#### END SCRAPE ####
# End Program profiling
profiler.end()
############ END PROGRAM ############