# --- reconstructed from mangled diff: additions to imageParser.py ---
# NOTE(review): this span of the mangled patch also contained partial hunks of
# getImage(), addImageUrls() and getImages(); those definitions are cut off
# from view and are not reconstructed here — only the complete ones below.

def parse_HTML(url):
    """Return the relative media paths listed in an HTML ad bundle.

    Fetches the bundle's index.html and extracts every ``localUrl:<path>}``
    entry from the page's inline JS.

    Parameters:
        url: URL of the bundle's index.html.
    Returns:
        list[str]: relative media URLs (may be empty).
    """
    page = requests.get(url).text
    # Strip quotes so the regex below matches the bare localUrl values.
    page = page.replace('"', '')
    # re.findall already returns a list; the original redundantly wrapped
    # it in list().
    return re.findall(r"localUrl:(.*?)}", page)


# Sample creatives kept around for manual testing.
url_i = "https://tpc.googlesyndication.com/sadbundle/$csp%3Der3$/3760334257784916408/index.html"
url = "https://tpc.googlesyndication.com/simgad/11634816136131995676"


def downloadImage(url):
    """Download an ad creative into the adimages/ directory.

    Three cases, decided first by URL extension, then by Content-Type:
      * direct image URL          -> save as adimages/<name>, return <name>
      * image served without ext  -> save as adimages/<name>.<ext>, return it
      * HTML ad bundle            -> save every bundle asset under
                                     adimages/<bundle_id>/, return <bundle_id>

    Returns None when the content is not a recognised image.
    """
    # `images` is presumably a module-level list of known image extensions
    # defined elsewhere in imageParser.py — TODO confirm.
    ext = url.split(".")[-1]
    print(ext)

    if ext in images:
        print("yeh")
        r = requests.get(url)
        img_name = url.split("/")[-1]
        # Ensure the output directory exists before writing.
        os.makedirs("adimages", exist_ok=True)
        with open(f'adimages/{img_name}', 'wb') as f:
            f.write(r.content)
        return img_name

    # No image extension: fetch it and trust the Content-Type header instead.
    r = requests.get(url)
    img_name = url.split("/")[-1]
    ext = r.headers['Content-Type'].split("/")[-1]
    # BUG FIX: the original passed the literal "adimages/{img_name}" (missing
    # f-prefix) through os.path.dirname, which yielded "adimages" only by
    # accident; create the directory explicitly.
    os.makedirs("adimages", exist_ok=True)

    if ext in images:
        with open(f'adimages/{img_name}.{ext}', 'wb') as f:
            f.write(r.content)
        return f'{img_name}.{ext}'

    if ext == "html":
        # HTML ad bundle: index.html lists the bundle's media assets.
        urls = parse_HTML(url)
        if not urls:
            # BUG FIX: the original indexed urls[1] unconditionally and
            # crashed on bundles with fewer than two assets.
            print("No media found in bundle")
            return None
        new_url = url.replace("index.html", "media/")
        # Bundle id is the path segment before index.html (hard-coded offset
        # the author already flagged for generalisation).
        img_name = url.split("/")[-2]
        print(img_name)
        # BUG FIX: create the bundle's own subdirectory once, before the
        # loop; the original created only "adimages", so the per-asset
        # writes below failed.
        os.makedirs(f'adimages/{img_name}', exist_ok=True)
        for i, sub_url in enumerate(urls):
            # Keep each asset's own extension — the original sampled urls[1]
            # and reused that single extension for every file (its own TODO).
            sub_ext = sub_url.split(".")[-1]
            r = requests.get(new_url + sub_url)
            with open(f'adimages/{img_name}/img_{i}_{len(urls)}.{sub_ext}', 'wb') as f:
                f.write(r.content)
        return img_name

    print("Probs not an image")
    return None
# --- reconstructed from mangled diff: post-patch ocrImages.py ---

def readImage(fileName, val=None):
    """OCR an ad image (or a directory of HTML-bundle frames) under adimages/.

    Parameters:
        fileName: file name inside adimages/ (or, for HTML bundles, the
            bundle's directory name).
        val: the row's image_type; pass "html" to OCR every file in the
            bundle directory instead of a single image.
    Returns:
        str: the recognised text (may be empty).

    NOTE(review): the author observed that transparent PNGs from bundles OCR
    poorly and suspected the background needs flattening with cv2 — confirm.
    """
    print("Converting {fileName} to greyscale".format(fileName=fileName))
    filePath = "adimages/" + fileName

    def process_img(path):
        # Greyscale via OpenCV before OCR; tesseract reads it better.
        image = cv2.imread(path)
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        # BUG FIX: split(".")[1] broke on paths containing extra dots; take
        # the real suffix instead.
        tempFile = "temp" + os.path.splitext(path)[1]
        cv2.imwrite(tempFile, gray)
        text = pytesseract.image_to_string(Image.open(tempFile))
        os.remove(tempFile)
        print(text)
        return text

    if ".gif" in fileName:
        # Animated GIF: OCR every frame and concatenate the results.
        text = ""
        img = Image.open(filePath)
        for frame in range(img.n_frames):
            img.seek(frame)
            text += pytesseract.image_to_string(img.convert('RGBA'))
        print(text)
        return text
    elif val == "html":
        # BUG FIX: the original tested `val in "html"`, a substring test that
        # "h"/"ht"/... also satisfied; compare for equality instead.
        text = ""
        for bundle_img in os.listdir(filePath):
            print(bundle_img)
            text += process_img(os.path.join(filePath, bundle_img))
        return text
    else:
        return process_img(filePath)


def _ocr_rows(image_type, test, pass_type):
    """Shared OCR loop the author flagged for merging: fetch rows of the
    given image_type that lack text, OCR them, and (unless test) save.

    pass_type: forward the row's image_type to readImage — needed for html
    bundles, which are directories of frames rather than single files.
    """
    queryString = ("* from aus_ads where image_type='{0}' "
                   "AND image_text is NULL").format(image_type)
    for row in scraperwiki.sqlite.select(queryString):
        row['image_text'] = ""
        # Robustness: also skip None image_name, not just "" (the query does
        # not filter NULLs and readImage(None) would crash).
        if row['image_name']:
            print(row['image_name'])
            val = row["image_type"] if pass_type else None
            row['image_text'] = readImage(row['image_name'], val)
        time.sleep(0.1)
        if not test:
            scraperwiki.sqlite.save(unique_keys=["Ad_ID"], data=row,
                                    table_name="aus_ads")


def ocr_HTML(test):
    """OCR every unprocessed html-bundle row (image_type='html')."""
    _ocr_rows("html", test, pass_type=True)


def ocrImages(test):
    """OCR every unprocessed plain-image row (image_type='image')."""
    _ocr_rows("image", test, pass_type=False)


ocrImages(False)


# --- reconstructed from mangled diff: new file ocr_html.py (scratch) ---
# -*- coding: utf-8 -*-
"""
Exploratory scratch script for OCR-ing HTML ad bundles.

Created on Tue Apr 26 13:09:12 2022

@author: everall
"""
import scraperwiki
import sqlite3
import cv2
import pandas as pd
import os
import pytesseract
import time
import simplejson as json
from PIL import Image
import requests
from bs4 import BeautifulSoup
# BUG FIX: `ip` was used below without ever being imported — presumably the
# local imageParser module; confirm.
import imageParser as ip

# BUG FIX: "C:\Users\..." is a SyntaxError in Python 3 ("\U" starts a
# unicode escape); Windows paths need a raw string.
os.chdir(r"C:\Users\everall\Documents\Python\Projects\google-ad-database-processing-scripts")

con = sqlite3.connect("scraperwiki.sqlite")
cur = con.cursor()
df = pd.read_sql_query("SELECT * FROM aus_ads", con)

ip.downloadImage("https://tpc.googlesyndication.com/sadbundle/$csp%3Der3$/3760334257784916408/index.html")

r = requests.get("https://tpc.googlesyndication.com/sadbundle/$csp%3Der3$/3760334257784916408/index.html")
page = BeautifulSoup(r.content)
page.findAll("localUrl")

# BUG FIX: this bare URL stood on its own line (a SyntaxError); kept as a
# comment for reference.
# https://transparencyreport.google.com/political-ads/advertiser/AR146135025295818752/creative/CR118339371345641472

ip.getImage("https://transparencyreport.google.com/political-ads/advertiser/AR146135025295818752/creative/CR118339371345641472")
ip.getImage("https://transparencyreport.google.com/political-ads/library/advertiser/AR133579976896151552/creative/CR208733795677896704")

ad_url = "https://transparencyreport.google.com/political-ads/advertiser/AR146135025295818752/creative/CR118339371345641472"

# url_split = ad_url.split("/")
# #can just introduce regex to find index
# ar_id = url_split[6]
# cr_id = url_split[8]
# # print(ar_id, cr_id)
# ad_api_url = f"https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={ar_id}&creative_id={cr_id}&hl=en"
# print(ad_api_url)
# ad_results = requests.get(ad_api_url)
# results_text = ad_results.text.replace(")]}'","").strip()
# ad_results_json = json.loads(results_text)

def get_HTML(ad_url):
    """Fetch the Transparency Report creative-details JSON for an ad URL.

    Parses the advertiser (AR...) and creative (CR...) ids out of the URL
    path, calls the internal details API, strips the anti-JSON-hijacking
    prefix ")]}'" from the response, and returns the decoded payload.

    Parameters:
        ad_url: a transparencyreport.google.com political-ads creative URL.
    Returns:
        the decoded JSON payload (nested lists, per the internal API).
    """
    url_split = ad_url.split("/")
    # Hard-coded path offsets — assumes the non-/library/ URL form; the
    # author noted a regex would generalise this. TODO confirm.
    ar_id = url_split[5]
    cr_id = url_split[7]
    ad_api_url = f"https://transparencyreport.google.com/transparencyreport/api/v3/politicalads/creatives/details?entity_id={ar_id}&creative_id={cr_id}&hl=en"
    ad_results = requests.get(ad_api_url)
    # The API prepends ")]}'" to its JSON; strip it before decoding.
    results_text = ad_results.text.replace(")]}'", "").strip()
    return json.loads(results_text)


ad_results_json = get_HTML(ad_url)
# BUG FIX: the original referenced an undefined name `results` here; the
# author clearly meant the JSON payload just fetched.
ad_results_json[0][3][4][3][3]
ad_url = ad_results_json[0][3][4][3][3]
print(ad_url)
ad_js = requests.get(ad_url).content
page = BeautifulSoup(ad_js)
page.findAll("img")