Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 103 additions & 37 deletions imageParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import simplejson as json
import scraperwiki
import time
import re
import os

# Get image Urls, download images, OCR and get the text, put text in database

Expand Down Expand Up @@ -51,11 +53,14 @@ def getImage(ad_url):

print(ad_img_url)
return {"image_url":ad_img_url, "image_type":"image"}

elif "https://tpc.googlesyndication.com/sadbundle/" in ad_js:
start = ad_js.index("https://tpc.googlesyndication.com/sadbundle/")
ad_img_url = ad_js[start:]
ad_img_url = ad_img_url.split(";frame-src")[0]

print(ad_img_url)

return {"image_url":ad_img_url, "image_type":"html"}

else:
Expand All @@ -80,51 +85,112 @@ def addImageUrls():
row['image_type'] = image_results['image_type']
scraperwiki.sqlite.save(unique_keys=["Ad_ID"], data=row, table_name="aus_ads")
time.sleep(0.1)



def parse_HTML(url):
    """Fetch an HTML5 ad bundle's index page and return its local asset paths.

    The bundle's index.html embeds entries shaped like ``localUrl:"<path>"...}``;
    after stripping double quotes, the regex captures each <path> suffix, which
    callers append to the bundle's ``media/`` base URL.

    :param url: URL of the ad bundle's index.html.
    :return: list of asset path suffixes (possibly empty).
    """
    # Extract the HTML as text.
    r = requests.get(url)
    page = r.text

    # Strip double quotes so the localUrl values are bare for the capture group.
    page = page.replace('"', '')

    # findall already returns a list — no need to wrap it or pre-compile a
    # pattern used only once. Raw string avoids escape surprises in the regex.
    return re.findall(r"localUrl:(.*?)}", page)



# NOTE(review): these look like leftover manual-test values — both functions
# below take `url` as a parameter, so these module-level names appear unused
# by the pipeline; confirm and remove.
url_i = "https://tpc.googlesyndication.com/sadbundle/$csp%3Der3$/3760334257784916408/index.html"
url = "https://tpc.googlesyndication.com/simgad/11634816136131995676"

def downloadImage(url):
    """Download the ad creative at *url* into ``adimages/`` and return its saved name.

    Three cases, decided by the URL's trailing extension and then by the
    response's Content-Type header:
      * URL already ends in a known image extension -> save the file as-is.
      * Content-Type is an image -> save with the detected extension appended.
      * Content-Type is html (an HTML5 ad bundle) -> scrape the bundle's asset
        paths via parse_HTML() and download each into ``adimages/<bundle_id>/``.

    :param url: creative URL (simgad image or sadbundle index.html).
    :return: saved file name / bundle directory name, or None when the content
             is neither an image nor an HTML bundle.
    """
    # Make sure the output directory exists before any write.
    os.makedirs("adimages", exist_ok=True)

    ext = url.split(".")[-1]
    print(ext)

    # If the URL itself carries an image extension, trust it.
    if ext in images:
        print("yeh")
        r = requests.get(url)
        img_name = url.split("/")[-1]
        with open(f'adimages/{img_name}', 'wb') as f:
            f.write(r.content)
        return img_name

    # No recognisable extension: fetch and inspect the Content-Type header
    # (e.g. "image/png" -> "png", "text/html" -> "html").
    r = requests.get(url)
    img_name = url.split("/")[-1]
    ext = r.headers['Content-Type'].split("/")[-1]

    if ext in images:
        with open(f'adimages/{img_name}.{ext}', 'wb') as f:
            f.write(r.content)
        return f'{img_name}.{ext}'

    elif ext == "html":
        # HTML5 ad bundle: collect the asset path suffixes from index.html.
        urls = parse_HTML(url)
        if not urls:
            print("Probs not an image")
            return None

        # Assets live under .../media/ next to index.html.
        new_url = url.replace("index.html", "media/")

        # Bundle id is the second-to-last path segment: .../<id>/index.html.
        img_name = url.split("/")[-2]
        print(img_name)

        # Sample one asset URL for the extension (the original sampled urls[1];
        # fall back to urls[0] when the bundle has a single asset).
        sample = urls[1] if len(urls) > 1 else urls[0]
        ext = sample.split(".")[-1]

        # One bundle directory, created once (was an f-string-less literal and
        # a makedirs on the wrong path, repeated every loop iteration).
        os.makedirs(f"adimages/{img_name}", exist_ok=True)

        total = len(urls)
        for i, suffix in enumerate(urls):
            # Concatenate each asset suffix onto the media base URL.
            sub_url = new_url + suffix
            r = requests.get(sub_url)
            with open(f'adimages/{img_name}/img_{i}_{total}.{ext}', 'wb') as f:
                f.write(r.content)

        return img_name

    else:
        print("Probs not an image")
        return None

def getImages():
queryString = "* from aus_ads where Ad_Type='Image' AND image_type IS NOT NULL AND image_name IS NULL"
Expand Down
136 changes: 93 additions & 43 deletions ocrImages.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,50 +5,100 @@
import time
from PIL import Image

def readImage(fileName):
    # NOTE(review): this is the pre-refactor version; it is shadowed at import
    # time by the later readImage(fileName, val=None) definition below — confirm
    # and delete.
    #
    # OCR the image at adimages/<fileName> and return the extracted text.
    # GIFs are OCR'd frame by frame; everything else is greyscaled via OpenCV
    # through a temp file first to improve tesseract accuracy.
    print("Converting {fileName} to greyscale".format(fileName=fileName))
    filePath = "adimages/" + fileName

    if ".gif" in fileName:
        text = ""
        img = Image.open(filePath)
        # Concatenate the OCR text of every animation frame.
        for frame in range(0,img.n_frames):
            img.seek(frame)
            imgrgb = img.convert('RGBA')
            # imgrgb.show()
            text = text + pytesseract.image_to_string(imgrgb)
        print(text)
        return text
    else:
        image = cv2.imread(filePath)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # NOTE(review): split(".")[1] picks the wrong extension when the file
        # name contains extra dots — confirm against actual saved names.
        ext = fileName.split(".")[1]
        tempFile = "temp." + ext
        cv2.imwrite(tempFile, gray)
        text = pytesseract.image_to_string(Image.open(tempFile))
        os.remove(tempFile)
        print(text)
        return text

#the html part of this function is not picking up the text in the png images
#I think may be because they are transparent, need to change background with cv2
#otherwise everything should work
def readImage(fileName, val = None):
    """OCR the ad saved under ``adimages/<fileName>`` and return the text.

    :param fileName: file name (image/gif) or directory name (html bundle)
                     inside ``adimages/``.
    :param val: the row's image_type; ``"html"`` means fileName is a directory
                of bundle assets to OCR individually. Default None -> single file.
    :return: extracted text (concatenated across frames/files where relevant).
    """
    print("Converting {fileName} to greyscale".format(fileName=fileName))
    filePath = "adimages/" + fileName

    def process_img(filePath):
        # Greyscale via OpenCV, round-trip through a temp file, then OCR.
        image = cv2.imread(filePath)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Last extension only — split(".")[1] broke on names with extra dots.
        ext = filePath.rsplit(".", 1)[-1]
        tempFile = "temp." + ext
        cv2.imwrite(tempFile, gray)
        try:
            text = pytesseract.image_to_string(Image.open(tempFile))
        finally:
            # Always clean up the temp file, even if OCR raises.
            os.remove(tempFile)
        print(text)
        return text

    if ".gif" in fileName:
        # Animated gif: OCR every frame and concatenate the text.
        text = ""
        img = Image.open(filePath)
        for frame in range(0, img.n_frames):
            img.seek(frame)
            imgrgb = img.convert('RGBA')
            # imgrgb.show()
            text = text + pytesseract.image_to_string(imgrgb)
        print(text)
        return text

    elif val == "html":
        # Was `val and val in "html"` — a substring test that also matched
        # "h", "ht", ... Equality is what's meant.
        # HTML bundles are saved as a directory of images: OCR each file.
        text = ""
        for img in os.listdir(filePath):
            print(img)
            text += process_img(os.path.join(filePath, img))
        return text

    else:
        return process_img(filePath)


#this can be merged with ocrImages, just seperated them for ease of construction
#this can be merged with ocrImages, just seperated them for ease of construction
def ocr_HTML(test):
    """OCR every html-type ad that has no extracted text yet.

    :param test: when truthy, run the OCR but skip persisting results.
    """
    queryString = "* from aus_ads where image_type='html' AND image_text is NULL"
    queryResult = scraperwiki.sqlite.select(queryString)

    for row in queryResult:
        val = row["image_type"]
        row['image_text'] = ""

        # Truthiness guard: image_name can be SQL NULL (None) as well as "",
        # and `!= ""` let None through to crash inside readImage.
        if row['image_name']:
            print(row['image_name'])
            row['image_text'] = readImage(row['image_name'], val)

        # Small pause between rows to be gentle on the OCR/db loop.
        time.sleep(0.1)
        if not test:
            scraperwiki.sqlite.save(unique_keys=["Ad_ID"], data=row, table_name="aus_ads")

def ocrImages(test):
    """OCR every image-type ad that has no extracted text yet.

    :param test: when truthy, run the OCR but skip persisting results.
    """
    # The previous body was duplicated wholesale (diff residue), so every row
    # was queried and processed twice; this is the single intended pass.
    queryString = "* from aus_ads where image_type='image' AND image_text is NULL"
    queryResult = scraperwiki.sqlite.select(queryString)

    for row in queryResult:
        row['image_text'] = ""

        # Truthiness guard: image_name can be SQL NULL (None) as well as "".
        if row['image_name']:
            print(row['image_name'])
            row['image_text'] = readImage(row['image_name'])

        # Small pause between rows to be gentle on the OCR/db loop.
        time.sleep(0.1)
        if not test:
            scraperwiki.sqlite.save(unique_keys=["Ad_ID"], data=row, table_name="aus_ads")

# Run the OCR pass only when executed as a script, not on import.
if __name__ == "__main__":
    ocrImages(False)
Loading