diff --git a/CHANGELOG.md b/CHANGELOG.md
index f04c36b..23d133e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,14 @@
# PyMeta Change Log
All notable changes to this project will be documented in this file.
+## [1.3.0]
+### Added
+* Google Custom Search API support with `--api-key` and `--search-engine-id` flags
+
+### Changed
+* Updated installation documentation to recommend pipx over pip
+* Added direct GitHub installation instructions
+
## [1.2.0]
### Added
* Users can now limit or specify searched file types
diff --git a/README.md b/README.md
index 793fb4e..f161f81 100644
--- a/README.md
+++ b/README.md
@@ -11,6 +11,8 @@ PyMeta is a Python3 rewrite of the tool [PowerMeta](https://github.com/dafthack/
Once downloaded, metadata is extracted from these files using Phil Harvey's [exiftool](https://sno.phy.queensu.ca/~phil/exiftool/) and added to a ```.csv``` report. Alternatively, Pymeta can be pointed at a directory to extract metadata from files manually downloaded using the ```-dir``` command line argument. See the [Usage](#Usage), or [All Options](#All-Options) section for more information.
+> **Note:** Due to Google's increasingly aggressive anti-bot measures, web scraping may yield limited results. For improved reliability and more consistent results, consider using the Google Custom Search API with the `--api-key` and `--search-engine-id` flags.
+
#### Why?
Metadata is a common place for penetration testers and red teamers to find: domains, user accounts, naming conventions, software/version numbers, and more!
@@ -24,25 +26,100 @@ Metadata is a common place for penetration testers and red teamers to find: doma
**Mac OS** - ```brew install exiftool```
### Install:
-Install the last stable release from PyPi:
+**Recommended: Install with pipx**
+
+Using [pipx](https://pypa.github.io/pipx/) is the preferred installation method as it installs PyMeta in an isolated environment, preventing conflicts with system Python packages:
+
```commandline
-pip3 install pymetasec
+pipx install pymetasec
```
-Or, install the most recent code from GitHub:
+If you don't have pipx installed:
+```commandline
+# Ubuntu/Debian
+sudo apt install pipx
+pipx ensurepath
+
+# Mac OS
+brew install pipx
+pipx ensurepath
+
+# Or via pip (then restart your shell)
+python3 -m pip install --user pipx
+python3 -m pipx ensurepath
```
+
+**Alternative: Install with pip**
+
+You can also install directly with pip, though this may affect system Python packages:
+```commandline
+pip3 install pymetasec
+```
+
+**Install from source:**
+
+Clone the repository and install locally:
+```bash
git clone https://github.com/m8sec/pymeta
cd pymeta
-python3 setup.py install
+pipx install .
+# Or with pip: pip3 install .
+```
+
+Or install directly from GitHub without cloning:
+```bash
+pipx install git+https://github.com/m8sec/pymeta
+# Or with pip: pip3 install git+https://github.com/m8sec/pymeta
```
## Usage
+
+### Standard Search (Web Scraping)
* Search Google and Bing for files within example.com and extract metadata to a csv report:
```pymeta -d example.com```
* Extract metadata from files within the given directory and create csv report:
```pymeta -dir Downloads/```
+### Google API Search
+Due to Google's aggressive anti-bot protections, web scraping may produce limited results. For better reliability, use the Google API option:
+
+```pymeta -d example.com --api-key "your_api_key_here" --search-engine-id "your_search_engine_id"```
+
+#### Setting up Google API
+
+**Step 1: Create Google Cloud Project**
+- Go to [Google Cloud Console](https://console.cloud.google.com/)
+- Login with a Google account
+- Click "Select a project" → "New Project"
+- Enter a project name (e.g., "PyMeta-API")
+- Click "Create"
+
+**Step 2: Enable Custom Search API**
+- In your project, go to "APIs & Services" → "Library"
+- Search for "Custom Search API"
+- Click on it and press "Enable"
+
+**Step 3: Create API Key**
+- Go to "APIs & Services" → "Credentials"
+- Click "Create Credentials" → "API Key"
+- Copy your API key (you'll need this for the `--api-key` flag)
+
+**Step 4: Create Custom Search Engine**
+- Go to [Google Programmable Search Engine](https://programmablesearchengine.google.com/)
+- Click "Add a search engine"
+- Enter any name (e.g., "PyMeta Search")
+- For "Sites to search", select "Search the entire web"
+- Click "Create"
+- Copy your Search Engine ID (you'll need this for the `--search-engine-id` flag)
+
+**API Usage Notes:**
+- Google provides 100 free API calls per day
+- Additional requests cost $5 per 1000 queries
+- API searches are more reliable than web scraping and less likely to be blocked
+- When using API mode, only Google search is used (Bing searches are disabled)
+
+> **NOTE**: Thanks to Beau Bullock [(@dafthack)](https://twitter.com/dafthack) and the [PowerMeta](https://github.com/dafthack/PowerMeta) project for the above steps on getting a Google API key.
## All Options
```
@@ -57,6 +134,10 @@ Search Options:
--file-type FILE_TYPE File types to search (default=pdf,xls,xlsx,csv,doc,docx,ppt,pptx)
-m MAX_RESULTS Max results per type search
+Google API Options:
+ --api-key API_KEY Google API key for Custom Search API
+ --search-engine-id ID Google Custom Search Engine ID
+
Proxy Options:
--proxy PROXY Proxy requests (IP:Port)
--proxy-file PROXY Load proxies from file for rotation
diff --git a/pymeta/__init__.py b/pymeta/__init__.py
index 461aa3a..1c8e97d 100644
--- a/pymeta/__init__.py
+++ b/pymeta/__init__.py
@@ -11,12 +11,12 @@
from pymeta import utils
from pymeta.logger import *
from subprocess import getoutput
-from pymeta.search import PyMeta, download_file
+from pymeta.search import PyMeta, PyMetaAPI, download_file
def status(args):
- VERSION = 'v1.2.0'
+ VERSION = 'v1.3.0'
print("\nPyMeta {} - {}\n".format(VERSION, highlight("by @m8sec", "bold", "gray")))
@@ -24,7 +24,13 @@ def status(args):
return
Log.info("Target Domain : {}".format(highlight(args.domain if args.domain else args.file_dir, "bold", "gray"), ))
- Log.info("Search Engines(s) : {}".format(highlight(', '.join(args.engine), "bold", "gray")))
+
+ # Show API mode status
+ if args.api_key and args.search_engine_id:
+ Log.info("Search Mode : {}".format(highlight("Google Custom Search API", "bold", "green")))
+ else:
+ Log.info("Search Engines(s) : {}".format(highlight(', '.join(args.engine), "bold", "gray")))
+
Log.info("File Types(s) : {}".format(highlight(', '.join(args.file_type), "bold", "gray"), ))
Log.info("Max Downloads : {}\n".format(highlight(args.max_results, "bold", "gray")))
@@ -41,6 +47,10 @@ def cli():
search.add_argument('--file-type', default='pdf,xls,xlsx,csv,doc,docx,ppt,pptx', type=lambda x: utils.delimiter2list(x), help='File types to search')
search.add_argument('-m', dest='max_results', type=int, default=50, help='Max results per type search')
+ api = args.add_argument_group("Google API Options")
+ api.add_argument('--api-key', dest='api_key', type=str, default=None, help='Google API key for Custom Search API')
+ api.add_argument('--search-engine-id', dest='search_engine_id', type=str, default=None, help='Google Custom Search Engine ID')
+
p = args.add_argument_group("Proxy Options")
pr = p.add_mutually_exclusive_group(required=False)
pr.add_argument('--proxy', dest='proxy', action='append', default=[], help='Proxy requests (IP:Port)')
@@ -59,14 +69,25 @@ def cli():
def start_scrape(args):
tmp = []
- Log.info('Searching {} for {} file type(s) on "{}"'.format(', '.join(args.engine), len(args.file_type), args.domain))
-
- for file_type in args.file_type:
- for search_engine in args.engine:
- pym = PyMeta(search_engine, args.domain, file_type, args.timeout, 3, args.proxy, args.jitter, args.max_results)
- if search_engine in pym.url.keys():
- tmp += pym.search()
- tmp = list(set(tmp))
+
+ # Check if API credentials are provided
+ if args.api_key and args.search_engine_id:
+ Log.info('Using Google Custom Search API for {} file type(s) on "{}"'.format(len(args.file_type), args.domain))
+
+ for file_type in args.file_type:
+ pym_api = PyMetaAPI(args.api_key, args.search_engine_id, args.domain, file_type, args.max_results)
+ tmp += pym_api.search()
+ tmp = list(set(tmp))
+ else:
+ # Fall back to web scraping
+ Log.info('Searching {} for {} file type(s) on "{}"'.format(', '.join(args.engine), len(args.file_type), args.domain))
+
+ for file_type in args.file_type:
+ for search_engine in args.engine:
+ pym = PyMeta(search_engine, args.domain, file_type, args.timeout, 3, args.proxy, args.jitter, args.max_results)
+ if search_engine in pym.url.keys():
+ tmp += pym.search()
+ tmp = list(set(tmp))
dwnld_dir = download_results(args, tmp)
extract_exif(dwnld_dir, args.report_file, tmp)
@@ -131,6 +152,12 @@ def extract_exif(file_dir, output_file, urls=[]):
def main():
try:
args = cli()
+
+ # Validate API arguments
+ if (args.api_key and not args.search_engine_id) or (not args.api_key and args.search_engine_id):
+ Log.fail("Both --api-key and --search-engine-id must be provided together for API mode")
+ exit(1)
+
status(args)
exif.exif_check()
diff --git a/pymeta/search.py b/pymeta/search.py
index 06d0db0..751e026 100644
--- a/pymeta/search.py
+++ b/pymeta/search.py
@@ -106,6 +106,95 @@ def results_handler(self, link):
logging.debug('Added URL: {}'.format(url))
+class PyMetaAPI:
+ """Google Custom Search API implementation for PyMeta"""
+
+ def __init__(self, api_key, search_engine_id, target, file_type, max_results=50):
+ self.api_key = api_key
+ self.search_engine_id = search_engine_id
+ self.target = target
+ self.file_type = file_type.lower()
+ self.max_results = max_results
+ self.results = []
+ self.api_url = "https://www.googleapis.com/customsearch/v1"
+
+ def search(self):
+ """Search for files using Google Custom Search API"""
+ start_index = 1
+
+ while len(self.results) < self.max_results:
+ try:
+ # Google API has a limit of 100 results
+ if start_index > 100:
+ Log.info('Reached Google API limit of 100 results')
+ break
+
+ # Build the API request
+ params = {
+ 'key': self.api_key,
+ 'cx': self.search_engine_id,
+ 'q': f'site:{self.target} filetype:{self.file_type}',
+ 'start': start_index
+ }
+
+ logging.debug(f'API Request: {self.api_url} with start={start_index}')
+ response = requests.get(self.api_url, params=params, timeout=10)
+
+ # Check for API errors
+ if response.status_code == 429:
+ Log.warn('API quota limit exceeded! Daily limit reached.')
+ break
+ elif response.status_code != 200:
+ Log.warn(f'API request failed with status code: {response.status_code}')
+ break
+
+ data = response.json()
+
+ # Check for errors in response
+ if 'error' in data:
+ error_msg = data['error'].get('message', 'Unknown error')
+ Log.warn(f'API Error: {error_msg}')
+ break
+
+ # Extract file URLs from results
+ if 'items' in data:
+ for item in data['items']:
+ url = item.get('link', '')
+ if url and self.is_valid_file_url(url):
+ self.results.append(url)
+ logging.debug(f'Added URL: {url}')
+
+ Log.info(f"{len(self.results):<3} | {self.file_type:<4} - API search (status: {response.status_code})")
+ else:
+ # No more results
+ logging.debug('No items in API response')
+ break
+
+ # Check if there are more pages
+ if 'queries' not in data or 'nextPage' not in data['queries']:
+ logging.debug('No more pages available')
+ break
+
+ # Move to next page (Google returns 10 results per page)
+ start_index += 10
+
+ # Brief delay between API calls
+ sleep(0.5)
+
+ except requests.exceptions.RequestException as e:
+ Log.warn(f'API request error: {e}')
+ break
+ except Exception as e:
+ Log.warn(f'Unexpected error during API search: {e}')
+ break
+
+ return self.results
+
+ def is_valid_file_url(self, url):
+ """Check if URL points to the target file type"""
+ return url.lower().endswith(f'.{self.file_type}')
+
+
def get_statuscode(resp):
try:
return resp.status_code
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..017a99a
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,41 @@
+[project]
+name = "pymetasec"
+version = "1.3.0"
+description = "PyMeta is a Python3 rewrite of PowerMeta for metadata extraction from files found via Google and Bing searches."
+authors = [
+ { name = "m8sec" }
+]
+readme = "README.md"
+license = {text = "GPL-3.0-or-later"}
+requires-python = ">=3.6"
+classifiers = [
+ "Environment :: Console",
+ "Programming Language :: Python :: 3",
+ "License :: OSI Approved :: GNU General Public License v3 or later (GPLv3+)",
+ "Topic :: Security"
+]
+dependencies = [
+ "requests>=2.28.1",
+ "bs4>=0.0.1",
+ "tldextract>=3.0.0",
+ "lxml>=4.9.0"
+]
+
+[project.urls]
+Homepage = "https://github.com/m8sec/pymeta"
+
+[tool.setuptools.packages.find]
+where = ["."]
+include = ["pymeta*"]
+exclude = ["tests*"]
+
+[tool.setuptools.package-data]
+pymeta = ["resources/*"]
+
+[project.scripts]
+pymeta = "pymeta:main"
+pymetasec = "pymeta:main"
+
+[build-system]
+requires = ["setuptools", "wheel"]
+build-backend = "setuptools.build_meta"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index fbf2cc4..05ec0d1 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,13 +1,4 @@
-beautifulsoup4>=4.11.1
-bs4>=0.0.1
-certifi>=2022.9.14
-charset-normalizer>=2.1.1
-filelock>=3.8.0
-idna>=3.4
-lxml>=4.9.2
requests>=2.28.1
-requests-file>=1.5.1
-six>=1.16.0
-soupsieve>=2.3.2.post1
-tldextract>=3.3.1
-urllib3>=1.26.12
+bs4>=0.0.1
+tldextract>=3.0.0
+lxml>=4.9.0
diff --git a/setup.py b/setup.py
index 2311628..9221fba 100644
--- a/setup.py
+++ b/setup.py
@@ -5,7 +5,7 @@
setup(
name='pymetasec',
- version='1.2.0',
+ version='1.3.0',
author= 'm8sec',
long_description=long_description,
long_description_content_type="text/markdown",
@@ -16,7 +16,9 @@
package_data={'pymeta': ['resources/*']},
install_requires=[
'requests>=2.28.1',
- 'bs4>=0.0.1'
+ 'bs4>=0.0.1',
+ 'tldextract>=3.0.0',
+ 'lxml>=4.9.0'
],
classifiers = [
"Environment :: Console",