# --- reconstructed from mangled diff: additions to imageParser.py ---
# NOTE(review): this span of the mangled patch also contained partial hunks of
# getImage(), addImageUrls() and getImages(); those definitions are cut off
# from view and are not reconstructed here — only the complete ones below.

def parse_HTML(url):
    """Return the relative media paths listed in an HTML ad bundle.

    Fetches the bundle's index.html and extracts every ``localUrl:<path>}``
    entry from the page's inline JS.

    Parameters:
        url: URL of the bundle's index.html.
    Returns:
        list[str]: relative media URLs (may be empty).
    """
    page = requests.get(url).text
    # Strip quotes so the regex below matches the bare localUrl values.
    page = page.replace('"', '')
    # re.findall already returns a list; the original redundantly wrapped
    # it in list().
    return re.findall(r"localUrl:(.*?)}", page)


# Sample creatives kept around for manual testing.
url_i = "https://tpc.googlesyndication.com/sadbundle/$csp%3Der3$/3760334257784916408/index.html"
url = "https://tpc.googlesyndication.com/simgad/11634816136131995676"


def downloadImage(url):
    """Download an ad creative into the adimages/ directory.

    Three cases, decided first by URL extension, then by Content-Type:
      * direct image URL          -> save as adimages/<name>, return <name>
      * image served without ext  -> save as adimages/<name>.<ext>, return it
      * HTML ad bundle            -> save every bundle asset under
                                     adimages/<bundle_id>/, return <bundle_id>

    Returns None when the content is not a recognised image.
    """
    # `images` is presumably a module-level list of known image extensions
    # defined elsewhere in imageParser.py — TODO confirm.
    ext = url.split(".")[-1]
    print(ext)

    if ext in images:
        print("yeh")
        r = requests.get(url)
        img_name = url.split("/")[-1]
        # Ensure the output directory exists before writing.
        os.makedirs("adimages", exist_ok=True)
        with open(f'adimages/{img_name}', 'wb') as f:
            f.write(r.content)
        return img_name

    # No image extension: fetch it and trust the Content-Type header instead.
    r = requests.get(url)
    img_name = url.split("/")[-1]
    ext = r.headers['Content-Type'].split("/")[-1]
    # BUG FIX: the original passed the literal "adimages/{img_name}" (missing
    # f-prefix) through os.path.dirname, which yielded "adimages" only by
    # accident; create the directory explicitly.
    os.makedirs("adimages", exist_ok=True)

    if ext in images:
        with open(f'adimages/{img_name}.{ext}', 'wb') as f:
            f.write(r.content)
        return f'{img_name}.{ext}'

    if ext == "html":
        # HTML ad bundle: index.html lists the bundle's media assets.
        urls = parse_HTML(url)
        if not urls:
            # BUG FIX: the original indexed urls[1] unconditionally and
            # crashed on bundles with fewer than two assets.
            print("No media found in bundle")
            return None
        new_url = url.replace("index.html", "media/")
        # Bundle id is the path segment before index.html (hard-coded offset
        # the author already flagged for generalisation).
        img_name = url.split("/")[-2]
        print(img_name)
        # BUG FIX: create the bundle's own subdirectory once, before the
        # loop; the original created only "adimages", so the per-asset
        # writes below failed.
        os.makedirs(f'adimages/{img_name}', exist_ok=True)
        for i, sub_url in enumerate(urls):
            # Keep each asset's own extension — the original sampled urls[1]
            # and reused that single extension for every file (its own TODO).
            sub_ext = sub_url.split(".")[-1]
            r = requests.get(new_url + sub_url)
            with open(f'adimages/{img_name}/img_{i}_{len(urls)}.{sub_ext}', 'wb') as f:
                f.write(r.content)
        return img_name

    print("Probs not an image")
    return None
# --- reconstructed from mangled diff: post-patch ocrImages.py ---

def readImage(fileName, val=None):
    """OCR an ad image (or a directory of HTML-bundle frames) under adimages/.

    Parameters:
        fileName: file name inside adimages/ (or, for HTML bundles, the
            bundle's directory name).
        val: the row's image_type; pass "html" to OCR every file in the
            bundle directory instead of a single image.
    Returns:
        str: the recognised text (may be empty).

    NOTE(review): the author observed that transparent PNGs from bundles OCR
    poorly and suspected the background needs flattening with cv2 — confirm.
    """
    print("Converting {fileName} to greyscale".format(fileName=fileName))
    filePath = "adimages/" + fileName

    def process_img(path):
        # Greyscale via OpenCV before OCR; tesseract reads it better.
        image = cv2.imread(path)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # BUG FIX: split(".")[1] broke on paths containing extra dots; take
        # the real suffix instead.
        tempFile = "temp" + os.path.splitext(path)[1]
        cv2.imwrite(tempFile, gray)
        text = pytesseract.image_to_string(Image.open(tempFile))
        os.remove(tempFile)
        print(text)
        return text

    if ".gif" in fileName:
        # Animated GIF: OCR every frame and concatenate the results.
        text = ""
        img = Image.open(filePath)
        for frame in range(img.n_frames):
            img.seek(frame)
            text += pytesseract.image_to_string(img.convert('RGBA'))
        print(text)
        return text
    elif val == "html":
        # BUG FIX: the original tested `val in "html"`, a substring test that
        # "h"/"ht"/... also satisfied; compare for equality instead.
        text = ""
        for bundle_img in os.listdir(filePath):
            print(bundle_img)
            text += process_img(os.path.join(filePath, bundle_img))
        return text
    else:
        return process_img(filePath)


def _ocr_rows(image_type, test, pass_type):
    """Shared OCR loop the author flagged for merging: fetch rows of the
    given image_type that lack text, OCR them, and (unless test) save.

    pass_type: forward the row's image_type to readImage — needed for html
    bundles, which are directories of frames rather than single files.
    """
    queryString = ("* from aus_ads where image_type='{0}' "
                   "AND image_text is NULL").format(image_type)
    for row in scraperwiki.sqlite.select(queryString):
        row['image_text'] = ""
        # Robustness: also skip None image_name, not just "" (the query does
        # not filter NULLs and readImage(None) would crash).
        if row['image_name']:
            print(row['image_name'])
            val = row["image_type"] if pass_type else None
            row['image_text'] = readImage(row['image_name'], val)
        time.sleep(0.1)
        if not test:
            scraperwiki.sqlite.save(unique_keys=["Ad_ID"], data=row,
                                    table_name="aus_ads")


def ocr_HTML(test):
    """OCR every unprocessed html-bundle row (image_type='html')."""
    _ocr_rows("html", test, pass_type=True)


def ocrImages(test):
    """OCR every unprocessed plain-image row (image_type='image')."""
    _ocr_rows("image", test, pass_type=False)


ocrImages(False)


# --- reconstructed from mangled diff: new file ocr_html.py (scratch) ---
# -*- coding: utf-8 -*-
"""
Exploratory scratch script for OCR-ing HTML ad bundles.

Created on Tue Apr 26 13:09:12 2022

@author: everall
"""
import scraperwiki
import sqlite3
import cv2
import pandas as pd
import os
import pytesseract
import time
import simplejson as json
from PIL import Image
import requests
from bs4 import BeautifulSoup
# BUG FIX: `ip` was used below without ever being imported — presumably the
# local imageParser module; confirm.
import imageParser as ip

# BUG FIX: "C:\Users\..." is a SyntaxError in Python 3 ("\U" starts a
# unicode escape); Windows paths need a raw string.
os.chdir(r"C:\Users\everall\Documents\Python\Projects\google-ad-database-processing-scripts")

con = sqlite3.connect("scraperwiki.sqlite")
cur = con.cursor()
df = pd.read_sql_query("SELECT * FROM aus_ads", con)

ip.downloadImage("https://tpc.googlesyndication.com/sadbundle/$csp%3Der3$/3760334257784916408/index.html")

r = requests.get("https://tpc.googlesyndication.com/sadbundle/$csp%3Der3$/3760334257784916408/index.html")
page = BeautifulSoup(r.content)
page.findAll("localUrl")

# BUG FIX: this bare URL stood on its own line (a SyntaxError); kept as a
# comment for reference.
# https://transparencyreport.google.com/political-ads/advertiser/AR146135025295818752/creative/CR118339371345641472

ip.getImage("https://transparencyreport.google.com/political-ads/advertiser/AR146135025295818752/creative/CR118339371345641472")
ip.getImage("https://transparencyreport.google.com/political-ads/library/advertiser/AR133579976896151552/creative/CR208733795677896704")

ad_url = "https://transparencyreport.google.com/political-ads/advertiser/AR146135025295818752/creative/CR118339371345641472"

# url_split = ad_url.split("/")
# #can just introduce regex to find index
# ar_id = url_split[6]
# cr_id = url_split[8]
# # print(ar_id, cr_id)
# ad_api_url = f"https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={ar_id}&creative_id={cr_id}&hl=en"
# print(ad_api_url)
# ad_results = requests.get(ad_api_url)
# results_text = ad_results.text.replace(")]}'","").strip()
# ad_results_json = json.loads(results_text)

def get_HTML(ad_url):
    """Fetch the Transparency Report creative-details JSON for an ad URL.

    Parses the advertiser (AR...) and creative (CR...) ids out of the URL
    path, calls the internal details API, strips the anti-JSON-hijacking
    prefix ")]}'" from the response, and returns the decoded payload.

    Parameters:
        ad_url: a transparencyreport.google.com political-ads creative URL.
    Returns:
        the decoded JSON payload (nested lists, per the internal API).
    """
    url_split = ad_url.split("/")
    # Hard-coded path offsets — assumes the non-/library/ URL form; the
    # author noted a regex would generalise this. TODO confirm.
    ar_id = url_split[5]
    cr_id = url_split[7]
    ad_api_url = f"https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={ar_id}&creative_id={cr_id}&hl=en"
    ad_results = requests.get(ad_api_url)
    # The API prepends ")]}'" to its JSON; strip it before decoding.
    results_text = ad_results.text.replace(")]}'", "").strip()
    return json.loads(results_text)


ad_results_json = get_HTML(ad_url)
# BUG FIX: the original referenced an undefined name `results` here; the
# author clearly meant the JSON payload just fetched.
ad_results_json[0][3][4][3][3]
ad_url = ad_results_json[0][3][4][3][3]
print(ad_url)
ad_js = requests.get(ad_url).content
page = BeautifulSoup(ad_js)
page.findAll("img")