Conversation
There was a problem hiding this comment.
Pull Request Overview
Adds a new web scraper for foodrepo.org that extracts product data including barcodes, names, and image URLs in the JSON format required by Barbase. The parser uses Selenium to handle dynamic page loading and collects product information from all unique product links.
Key changes:
- New Selenium-based parser that scrapes product data from foodrepo.org
- Extracts barcodes, product names, and image URLs in required JSON format
- Comprehensive documentation with installation and usage instructions
Reviewed Changes
Copilot reviewed 2 out of 9 changed files in this pull request and generated 5 comments.
| File | Description |
|---|---|
| domains/foodrepo.org/parser.py | Main scraper implementation using Selenium and BeautifulSoup |
| domains/foodrepo.org/README.md | Documentation covering installation, usage, and parser functionality |
Files not reviewed (6)
- .idea/.gitignore: Language not supported
- .idea/barbase-tools.iml: Language not supported
- .idea/inspectionProfiles/profiles_settings.xml: Language not supported
- .idea/misc.xml: Language not supported
- .idea/modules.xml: Language not supported
- .idea/vcs.xml: Language not supported
Tip: Customize your code reviews with copilot-instructions.md. Create the file or learn how to get started.
domains/foodrepo.org/parser.py
Outdated
| products_data = [] | ||
|
|
||
| with webdriver.Chrome(service=service) as driver: | ||
| # URL of the page with the list of products | ||
| base_url = "https://www.foodrepo.org/en/products" | ||
| driver.get(base_url) | ||
|
|
||
| # Wait until at least one link to the product appears on the page | ||
| wait = WebDriverWait(driver, 10) | ||
| wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='/en/products/']"))) | ||
|
|
||
| # Receive HTML | ||
| soup = BeautifulSoup(driver.page_source, 'html.parser') | ||
|
|
||
| # Find all links to product pages | ||
| product_links = list(set([a['href'] for a in soup.find_all('a', href=True) if '/en/products/' in a['href']])) | ||
|
|
||
| # Going through each link | ||
| for link in product_links: | ||
| product_url = f"https://www.foodrepo.org{link}" | ||
| driver.get(product_url) | ||
|
|
||
| # Wait of h1 to appear | ||
| wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1"))) | ||
| product_soup = BeautifulSoup(driver.page_source, 'html.parser') | ||
|
|
||
| # Product name | ||
| title_tag = product_soup.find('h1') | ||
| if title_tag: | ||
| text = title_tag.get_text(strip=True) | ||
| title_text = text if not text.isdigit() else "Not found" | ||
| else: | ||
| title_text = "Not found" | ||
|
|
||
| # Images | ||
| img_tags = product_soup.find_all('img', alt=lambda x: x and x.startswith("Image #")) | ||
| img_urls = [img['src'] for img in img_tags if img.get('src')] | ||
|
|
||
| # Barcode (EAN) | ||
| barcode_div = product_soup.find('span', class_='font-weight-bold', string='Barcode') | ||
| if barcode_div and barcode_div.parent: | ||
| barcode_text = barcode_div.parent.get_text(strip=True).replace('Barcode', '').strip() | ||
| barcode = barcode_text if barcode_text.isdigit() else 'Not found' | ||
| else: | ||
| barcode = 'Not found' | ||
|
|
||
| products_data.append({ | ||
| "barcode": barcode, | ||
| "name": title_text, | ||
| "image_links": img_urls | ||
| }) | ||
|
|
||
| # Save in JSON | ||
| with open("foodrepo.json", "w", encoding="utf-8") as f: | ||
| json.dump(products_data, f, ensure_ascii=False, indent=4) | ||
|
|
||
| print("Data successfully saved in 'foodrepo.json'") |
There was a problem hiding this comment.
The module-level variable `products_data` should be defined inside a function (for example a `main()` guarded by `if __name__ == "__main__":`) rather than at the top level, to improve code organization and avoid global state.
| products_data = [] | |
| with webdriver.Chrome(service=service) as driver: | |
| # URL of the page with the list of products | |
| base_url = "https://www.foodrepo.org/en/products" | |
| driver.get(base_url) | |
| # Wait until at least one link to the product appears on the page | |
| wait = WebDriverWait(driver, 10) | |
| wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='/en/products/']"))) | |
| # Receive HTML | |
| soup = BeautifulSoup(driver.page_source, 'html.parser') | |
| # Find all links to product pages | |
| product_links = list(set([a['href'] for a in soup.find_all('a', href=True) if '/en/products/' in a['href']])) | |
| # Going through each link | |
| for link in product_links: | |
| product_url = f"https://www.foodrepo.org{link}" | |
| driver.get(product_url) | |
| # Wait of h1 to appear | |
| wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1"))) | |
| product_soup = BeautifulSoup(driver.page_source, 'html.parser') | |
| # Product name | |
| title_tag = product_soup.find('h1') | |
| if title_tag: | |
| text = title_tag.get_text(strip=True) | |
| title_text = text if not text.isdigit() else "Not found" | |
| else: | |
| title_text = "Not found" | |
| # Images | |
| img_tags = product_soup.find_all('img', alt=lambda x: x and x.startswith("Image #")) | |
| img_urls = [img['src'] for img in img_tags if img.get('src')] | |
| # Barcode (EAN) | |
| barcode_div = product_soup.find('span', class_='font-weight-bold', string='Barcode') | |
| if barcode_div and barcode_div.parent: | |
| barcode_text = barcode_div.parent.get_text(strip=True).replace('Barcode', '').strip() | |
| barcode = barcode_text if barcode_text.isdigit() else 'Not found' | |
| else: | |
| barcode = 'Not found' | |
| products_data.append({ | |
| "barcode": barcode, | |
| "name": title_text, | |
| "image_links": img_urls | |
| }) | |
| # Save in JSON | |
| with open("foodrepo.json", "w", encoding="utf-8") as f: | |
| json.dump(products_data, f, ensure_ascii=False, indent=4) | |
| print("Data successfully saved in 'foodrepo.json'") | |
| def main(): | |
| products_data = [] | |
| with webdriver.Chrome(service=service) as driver: | |
| # URL of the page with the list of products | |
| base_url = "https://www.foodrepo.org/en/products" | |
| driver.get(base_url) | |
| # Wait until at least one link to the product appears on the page | |
| wait = WebDriverWait(driver, 10) | |
| wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='/en/products/']"))) | |
| # Receive HTML | |
| soup = BeautifulSoup(driver.page_source, 'html.parser') | |
| # Find all links to product pages | |
| product_links = list(set([a['href'] for a in soup.find_all('a', href=True) if '/en/products/' in a['href']])) | |
| # Going through each link | |
| for link in product_links: | |
| product_url = f"https://www.foodrepo.org{link}" | |
| driver.get(product_url) | |
| # Wait of h1 to appear | |
| wait.until(EC.presence_of_element_located((By.TAG_NAME, "h1"))) | |
| product_soup = BeautifulSoup(driver.page_source, 'html.parser') | |
| # Product name | |
| title_tag = product_soup.find('h1') | |
| if title_tag: | |
| text = title_tag.get_text(strip=True) | |
| title_text = text if not text.isdigit() else "Not found" | |
| else: | |
| title_text = "Not found" | |
| # Images | |
| img_tags = product_soup.find_all('img', alt=lambda x: x and x.startswith("Image #")) | |
| img_urls = [img['src'] for img in img_tags if img.get('src')] | |
| # Barcode (EAN) | |
| barcode_div = product_soup.find('span', class_='font-weight-bold', string='Barcode') | |
| if barcode_div and barcode_div.parent: | |
| barcode_text = barcode_div.parent.get_text(strip=True).replace('Barcode', '').strip() | |
| barcode = barcode_text if barcode_text.isdigit() else 'Not found' | |
| else: | |
| barcode = 'Not found' | |
| products_data.append({ | |
| "barcode": barcode, | |
| "name": title_text, | |
| "image_links": img_urls | |
| }) | |
| # Save in JSON | |
| with open("foodrepo.json", "w", encoding="utf-8") as f: | |
| json.dump(products_data, f, ensure_ascii=False, indent=4) | |
| print("Data successfully saved in 'foodrepo.json'") | |
| if __name__ == "__main__": | |
| main() |
domains/foodrepo.org/parser.py
Outdated
| product_url = f"https://www.foodrepo.org{link}" | ||
| driver.get(product_url) | ||
|
|
||
| # Wait of h1 to appear |
There was a problem hiding this comment.
There is a grammatical error in the comment: it should read 'Wait for h1 to appear' instead of 'Wait of h1 to appear'.
| # Wait of h1 to appear | |
| # Wait for h1 to appear |
domains/foodrepo.org/parser.py
Outdated
| wait = WebDriverWait(driver, 10) | ||
| wait.until(EC.presence_of_element_located((By.CSS_SELECTOR, "a[href*='/en/products/']"))) | ||
|
|
||
| # Receive HTML |
There was a problem hiding this comment.
Comment should use 'Get HTML' or 'Retrieve HTML' instead of 'Receive HTML' for clarity.
| # Receive HTML | |
| # Get HTML |
domains/foodrepo.org/parser.py
Outdated
| # Find all links to product pages | ||
| product_links = list(set([a['href'] for a in soup.find_all('a', href=True) if '/en/products/' in a['href']])) | ||
|
|
||
| # Going through each link |
There was a problem hiding this comment.
Comment should be 'Go through each link' or 'Iterate through each link' instead of 'Going through each link'.
| # Going through each link | |
| # Iterate through each link |
domains/foodrepo.org/parser.py
Outdated
| "image_links": img_urls | ||
| }) | ||
|
|
||
| # Save in JSON |
There was a problem hiding this comment.
Comment should be 'Save as JSON' or 'Save to JSON' instead of 'Save in JSON'.
| # Save in JSON | |
| # Save as JSON |
|
Add parser for foodrepo.org
This parser collects product data from foodrepo.org and outputs it in the JSON format required by Barbase:
```json
{
    "barcode": "string",
    "name": "string",
    "image_links": ["url_1", "url_2"]
}
```