Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
140 changes: 103 additions & 37 deletions imageParser.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
import simplejson as json
import scraperwiki
import time
import re
import os

# Get image Urls, download images, OCR and get the text, put text in database

Expand Down Expand Up @@ -51,11 +53,14 @@ def getImage(ad_url):

print(ad_img_url)
return {"image_url":ad_img_url, "image_type":"image"}

elif "https://tpc.googlesyndication.com/sadbundle/" in ad_js:
start = ad_js.index("https://tpc.googlesyndication.com/sadbundle/")
ad_img_url = ad_js[start:]
ad_img_url = ad_img_url.split(";frame-src")[0]

print(ad_img_url)

return {"image_url":ad_img_url, "image_type":"html"}

else:
Expand All @@ -80,51 +85,112 @@ def addImageUrls():
row['image_type'] = image_results['image_type']
scraperwiki.sqlite.save(unique_keys=["Ad_ID"], data=row, table_name="aus_ads")
time.sleep(0.1)



def parse_HTML(url):
    """Fetch an HTML5 ad bundle's index page and return its local asset paths.

    The bundle's index.html embeds entries shaped like ``localUrl:"<path>"...}``;
    after stripping double quotes, the regex captures each <path> suffix, which
    callers append to the bundle's ``media/`` base URL.

    :param url: URL of the ad bundle's index.html.
    :return: list of asset path suffixes (possibly empty).
    """
    # Extract the HTML as text.
    r = requests.get(url)
    page = r.text

    # Strip double quotes so the localUrl values are bare for the capture group.
    page = page.replace('"', '')

    # findall already returns a list — no need to wrap it or pre-compile a
    # pattern used only once. Raw string avoids escape surprises in the regex.
    return re.findall(r"localUrl:(.*?)}", page)



# NOTE(review): these look like leftover manual-test values — both functions
# below take `url` as a parameter, so these module-level names appear unused
# by the pipeline; confirm and remove.
url_i = "https://tpc.googlesyndication.com/sadbundle/$csp%3Der3$/3760334257784916408/index.html"
url = "https://tpc.googlesyndication.com/simgad/11634816136131995676"

def downloadImage(url):
    """Download the ad creative at *url* into ``adimages/`` and return its saved name.

    Three cases, decided by the URL's trailing extension and then by the
    response's Content-Type header:
      * URL already ends in a known image extension -> save the file as-is.
      * Content-Type is an image -> save with the detected extension appended.
      * Content-Type is html (an HTML5 ad bundle) -> scrape the bundle's asset
        paths via parse_HTML() and download each into ``adimages/<bundle_id>/``.

    :param url: creative URL (simgad image or sadbundle index.html).
    :return: saved file name / bundle directory name, or None when the content
             is neither an image nor an HTML bundle.
    """
    # Make sure the output directory exists before any write.
    os.makedirs("adimages", exist_ok=True)

    ext = url.split(".")[-1]
    print(ext)

    # If the URL itself carries an image extension, trust it.
    if ext in images:
        print("yeh")
        r = requests.get(url)
        img_name = url.split("/")[-1]
        with open(f'adimages/{img_name}', 'wb') as f:
            f.write(r.content)
        return img_name

    # No recognisable extension: fetch and inspect the Content-Type header
    # (e.g. "image/png" -> "png", "text/html" -> "html").
    r = requests.get(url)
    img_name = url.split("/")[-1]
    ext = r.headers['Content-Type'].split("/")[-1]

    if ext in images:
        with open(f'adimages/{img_name}.{ext}', 'wb') as f:
            f.write(r.content)
        return f'{img_name}.{ext}'

    elif ext == "html":
        # HTML5 ad bundle: collect the asset path suffixes from index.html.
        urls = parse_HTML(url)
        if not urls:
            print("Probs not an image")
            return None

        # Assets live under .../media/ next to index.html.
        new_url = url.replace("index.html", "media/")

        # Bundle id is the second-to-last path segment: .../<id>/index.html.
        img_name = url.split("/")[-2]
        print(img_name)

        # Sample one asset URL for the extension (the original sampled urls[1];
        # fall back to urls[0] when the bundle has a single asset).
        sample = urls[1] if len(urls) > 1 else urls[0]
        ext = sample.split(".")[-1]

        # One bundle directory, created once (was an f-string-less literal and
        # a makedirs on the wrong path, repeated every loop iteration).
        os.makedirs(f"adimages/{img_name}", exist_ok=True)

        total = len(urls)
        for i, suffix in enumerate(urls):
            # Concatenate each asset suffix onto the media base URL.
            sub_url = new_url + suffix
            r = requests.get(sub_url)
            with open(f'adimages/{img_name}/img_{i}_{total}.{ext}', 'wb') as f:
                f.write(r.content)

        return img_name

    else:
        print("Probs not an image")
        return None

def getImages():
queryString = "* from aus_ads where Ad_Type='Image' AND image_type IS NOT NULL AND image_name IS NULL"
Expand Down
136 changes: 93 additions & 43 deletions ocrImages.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,50 +5,100 @@
import time
from PIL import Image

def readImage(fileName):
    # NOTE(review): this is the pre-refactor version; it is shadowed at import
    # time by the later readImage(fileName, val=None) definition below — confirm
    # and delete.
    #
    # OCR the image at adimages/<fileName> and return the extracted text.
    # GIFs are OCR'd frame by frame; everything else is greyscaled via OpenCV
    # through a temp file first to improve tesseract accuracy.
    print("Converting {fileName} to greyscale".format(fileName=fileName))
    filePath = "adimages/" + fileName

    if ".gif" in fileName:
        text = ""
        img = Image.open(filePath)
        # Concatenate the OCR text of every animation frame.
        for frame in range(0,img.n_frames):
            img.seek(frame)
            imgrgb = img.convert('RGBA')
            # imgrgb.show()
            text = text + pytesseract.image_to_string(imgrgb)
        print(text)
        return text
    else:
        image = cv2.imread(filePath)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # NOTE(review): split(".")[1] picks the wrong extension when the file
        # name contains extra dots — confirm against actual saved names.
        ext = fileName.split(".")[1]
        tempFile = "temp." + ext
        cv2.imwrite(tempFile, gray)
        text = pytesseract.image_to_string(Image.open(tempFile))
        os.remove(tempFile)
        print(text)
        return text

#the html part of this function is not picking up the text in the png images
#I think may be because they are transparent, need to change background with cv2
#otherwise everything should work
def readImage(fileName, val = None):
    """OCR the ad saved under ``adimages/<fileName>`` and return the text.

    :param fileName: file name (image/gif) or directory name (html bundle)
                     inside ``adimages/``.
    :param val: the row's image_type; ``"html"`` means fileName is a directory
                of bundle assets to OCR individually. Default None -> single file.
    :return: extracted text (concatenated across frames/files where relevant).
    """
    print("Converting {fileName} to greyscale".format(fileName=fileName))
    filePath = "adimages/" + fileName

    def process_img(filePath):
        # Greyscale via OpenCV, round-trip through a temp file, then OCR.
        image = cv2.imread(filePath)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # Last extension only — split(".")[1] broke on names with extra dots.
        ext = filePath.rsplit(".", 1)[-1]
        tempFile = "temp." + ext
        cv2.imwrite(tempFile, gray)
        try:
            text = pytesseract.image_to_string(Image.open(tempFile))
        finally:
            # Always clean up the temp file, even if OCR raises.
            os.remove(tempFile)
        print(text)
        return text

    if ".gif" in fileName:
        # Animated gif: OCR every frame and concatenate the text.
        text = ""
        img = Image.open(filePath)
        for frame in range(0, img.n_frames):
            img.seek(frame)
            imgrgb = img.convert('RGBA')
            # imgrgb.show()
            text = text + pytesseract.image_to_string(imgrgb)
        print(text)
        return text

    elif val == "html":
        # Was `val and val in "html"` — a substring test that also matched
        # "h", "ht", ... Equality is what's meant.
        # HTML bundles are saved as a directory of images: OCR each file.
        text = ""
        for img in os.listdir(filePath):
            print(img)
            text += process_img(os.path.join(filePath, img))
        return text

    else:
        return process_img(filePath)


#this can be merged with ocrImages, just seperated them for ease of construction
#this can be merged with ocrImages, just seperated them for ease of construction
def ocr_HTML(test):
    """OCR every html-type ad that has no extracted text yet.

    :param test: when truthy, run the OCR but skip persisting results.
    """
    queryString = "* from aus_ads where image_type='html' AND image_text is NULL"
    queryResult = scraperwiki.sqlite.select(queryString)

    for row in queryResult:
        val = row["image_type"]
        row['image_text'] = ""

        # Truthiness guard: image_name can be SQL NULL (None) as well as "",
        # and `!= ""` let None through to crash inside readImage.
        if row['image_name']:
            print(row['image_name'])
            row['image_text'] = readImage(row['image_name'], val)

        # Small pause between rows to be gentle on the OCR/db loop.
        time.sleep(0.1)
        if not test:
            scraperwiki.sqlite.save(unique_keys=["Ad_ID"], data=row, table_name="aus_ads")

def ocrImages(test):
    """OCR every image-type ad that has no extracted text yet.

    :param test: when truthy, run the OCR but skip persisting results.
    """
    # The previous body was duplicated wholesale (diff residue), so every row
    # was queried and processed twice; this is the single intended pass.
    queryString = "* from aus_ads where image_type='image' AND image_text is NULL"
    queryResult = scraperwiki.sqlite.select(queryString)

    for row in queryResult:
        row['image_text'] = ""

        # Truthiness guard: image_name can be SQL NULL (None) as well as "".
        if row['image_name']:
            print(row['image_name'])
            row['image_text'] = readImage(row['image_name'])

        # Small pause between rows to be gentle on the OCR/db loop.
        time.sleep(0.1)
        if not test:
            scraperwiki.sqlite.save(unique_keys=["Ad_ID"], data=row, table_name="aus_ads")

# Run the OCR pass only when executed as a script, not on import.
if __name__ == "__main__":
    ocrImages(False)
Loading