Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
repos:
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v5.0.0
hooks:
- id: trailing-whitespace
- id: end-of-file-fixer
- id: check-yaml
- id: check-merge-conflict
- id: check-added-large-files
- id: check-ast # Python syntax check
- id: debug-statements # No debugger/breakpoint statements
- id: check-docstring-first # Docstring before code
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ run_script:

clean:
find . -type d -name "__pycache__" -exec rm -r {} +
rm -rf build dist *.egg-info result.json
rm -rf build dist *.egg-info

help:
@echo "Makefile targets:"
Expand Down
10 changes: 5 additions & 5 deletions setup.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
from setuptools import setup, find_packages
from setuptools import find_packages, setup

setup(
name='cli_parser',
version='0.1.0',
name="cli_parser",
version="0.1.0",
packages=find_packages(),
install_requires=[],
entry_points={
'console_scripts': [
'cli_parser=src.parser:main',
"console_scripts": [
"cli_parser=src.parser:main",
],
},
)
342 changes: 342 additions & 0 deletions src/binary_finder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,342 @@
#!/usr/bin/env python3
"""
Binary Finder Module - Locate binaries in Docker containers and host systems

Features:
- Efficient single-pass filesystem search using find command
- Substring matching for tool name discovery
- Caching to avoid repeated scans
- 2-minute timeout for large container images
"""
import os
import re
import subprocess
from typing import Optional, List, Tuple, Set
from pathlib import Path


class BinaryFinder:
"""Find binaries in Docker containers and host systems"""

# Cache of container scan results, shared by all callers through the class:
# maps a Docker image name to the list of executable paths found in it, so
# discover_all_executables can skip re-scanning an image it has already seen.
_container_cache = {}

# Timeout, in seconds (2 minutes), applied to the full-filesystem scan in
# discover_all_executables; the quicker probes elsewhere use their own limits.
CONTAINER_TIMEOUT = 120

@staticmethod
def find_on_host(binary_name: str) -> Tuple[Optional[str], str]:
"""
Find binary on host system

Args:
binary_name: Name of binary to find

Returns:
Tuple of (binary_path, discovery_method) or (None, "not_found")
"""
# Try which command first (fastest)
try:
result = subprocess.run(
['which', binary_name],
capture_output=True,
text=True,
timeout=5
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip(), "which"
except (subprocess.TimeoutExpired, FileNotFoundError):
pass

# Try common locations
common_paths = [
'/usr/bin',
'/usr/local/bin',
'/bin',
'/opt/bin',
os.path.expanduser('~/.local/bin')
]

for path in common_paths:
binary_path = Path(path) / binary_name
if binary_path.exists() and os.access(binary_path, os.X_OK):
return str(binary_path), "common_path"

return None, "not_found"

@staticmethod
def find_in_container(docker_image: str, binary_name: str) -> Tuple[Optional[str], str]:
    """
    Find binary in Docker container

    Runs up to two short-lived containers from the image, each asking the
    container itself to resolve the name on its PATH.

    Args:
        docker_image: Full Docker image name (e.g., "alpine:latest")
        binary_name: Name of binary to find

    Returns:
        Tuple of (binary_path, discovery_method) or (None, "not_found")
    """
    # Each probe is (arguments after `docker run --rm`, discovery tag),
    # ordered fastest first.
    probes = [
        # `which` as the entrypoint: no shell involved at all.
        (['--entrypoint', 'which', docker_image, binary_name],
         "which_in_container"),
        # `command -v` works in more minimal containers. The script text is
        # a constant and binary_name is handed to sh as positional parameter
        # $1 (the literal 'sh' fills $0), so shell metacharacters in the
        # name are never interpreted as shell syntax.
        (['--entrypoint', 'sh', docker_image,
          '-c', 'command -v -- "$1"', 'sh', binary_name],
         "command_v"),
    ]

    for probe_args, method in probes:
        try:
            result = subprocess.run(
                ['docker', 'run', '--rm', *probe_args],
                capture_output=True,
                text=True,
                timeout=30
            )
        except subprocess.TimeoutExpired:
            # This probe hung; the next one may still answer.
            continue
        except FileNotFoundError:
            # The docker CLI is not installed on the host, so no probe can
            # succeed; report "not found" instead of crashing, matching
            # how find_on_host treats a missing helper program.
            break

        located = result.stdout.strip()
        if result.returncode == 0 and located:
            return located, method

    # No probe succeeded: fall through to the not-found result that closes
    # this method.
Comment on lines 92 to 104
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🔴 Critical

🧩 Analysis chain

🏁 Script executed:

# First, locate and examine the binary_finder.py file
find . -name "binary_finder.py" -type f

Repository: O1ahmad/cli-parser

Length of output: 84


🏁 Script executed:

# Check the content around lines 92-104 to see the exact code
if [ -f "src/binary_finder.py" ]; then
  cat -n src/binary_finder.py | sed -n '85,110p'
fi

Repository: O1ahmad/cli-parser

Length of output: 1180


🏁 Script executed:

# Look for where binary_name comes from and if there's any validation
if [ -f "src/binary_finder.py" ]; then
  rg -n "binary_name" src/binary_finder.py -B 2 -A 2
fi

Repository: O1ahmad/cli-parser

Length of output: 1769


🏁 Script executed:

# Check function signature and parameter origin to understand if binary_name is user-controlled
if [ -f "src/binary_finder.py" ]; then
  cat -n src/binary_finder.py | sed -n '1,95p'
fi

Repository: O1ahmad/cli-parser

Length of output: 3773


🏁 Script executed:

# Find where find_in_container is called
rg -n "find_in_container" src/binary_finder.py -B 2 -A 2

Repository: O1ahmad/cli-parser

Length of output: 604


🏁 Script executed:

# Check for other uses of sh -c in the file
rg -n "sh.*-c" src/binary_finder.py -B 2 -A 2

Repository: O1ahmad/cli-parser

Length of output: 44


🏁 Script executed:

# Look for any validation/sanitization of binary_name
rg -n "binary_name" src/binary_finder.py | head -20

Repository: O1ahmad/cli-parser

Length of output: 638


🏁 Script executed:

# Check for callers of find_in_container outside this file
rg -n "find_in_container" --type py

Repository: O1ahmad/cli-parser

Length of output: 402


🏁 Script executed:

# Check if binary_name is validated anywhere before being passed
if [ -f "src/binary_finder.py" ]; then
  cat -n src/binary_finder.py | sed -n '100,210p'
fi

Repository: O1ahmad/cli-parser

Length of output: 5025


🏁 Script executed:

# Check parser_v2.py to understand the other call site
rg -n "find_in_container" src/parser_v2.py -B 5 -A 5

Repository: O1ahmad/cli-parser

Length of output: 503


🏁 Script executed:

# Verify the syntax of the proposed fix is valid
python3 << 'EOF'
import subprocess

# Test that the proposed syntax is valid (without executing)
# When using a list with subprocess.run, shell=False (the default)
# The arguments are passed directly without shell interpretation
proposed_command = [
    'echo', 'test',  # Simulated docker run command
    '-c', 'command -v -- "$1"', 'sh', 'test-binary'
]

# This demonstrates the list structure is valid
print("Proposed command structure (as list):")
print(proposed_command)
print("\nWhen passed to subprocess.run with list (shell=False):")
print("- Arguments 0-2: echo test -c")
print("- Argument 3: 'command -v -- \"$1\"' (literal string, no shell interpretation)")
print("- Argument 4: 'sh'")
print("- Argument 5: 'test-binary' (passed as $1 to sh, no interpolation)")
print("\nConclusion: The proposed syntax is correct for safe parameter passing")
EOF

Repository: O1ahmad/cli-parser

Length of output: 148


Fix shell injection vulnerability in container command check.

The f'command -v {binary_name}' string passed to sh -c allows command injection. If binary_name contains shell metacharacters (e.g., test; rm -rf / or $(malicious)), arbitrary commands execute in the container. Pass the name as a positional parameter instead.

🔒 Safer parameter passing
-            result = subprocess.run(
-                ['docker', 'run', '--rm', '--entrypoint', 'sh', docker_image, 
-                 '-c', f'command -v {binary_name}'],
+            result = subprocess.run(
+                ['docker', 'run', '--rm', '--entrypoint', 'sh', docker_image,
+                 '-c', 'command -v -- "$1"', 'sh', binary_name],
                 capture_output=True,
                 text=True,
                 timeout=30
             )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
# Try command -v (works in more minimal containers)
try:
result = subprocess.run(
['docker', 'run', '--rm', '--entrypoint', 'sh', docker_image,
'-c', f'command -v {binary_name}'],
capture_output=True,
text=True,
timeout=30
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip(), "command_v"
except subprocess.TimeoutExpired:
pass
# Try command -v (works in more minimal containers)
try:
result = subprocess.run(
['docker', 'run', '--rm', '--entrypoint', 'sh', docker_image,
'-c', 'command -v -- "$1"', 'sh', binary_name],
capture_output=True,
text=True,
timeout=30
)
if result.returncode == 0 and result.stdout.strip():
return result.stdout.strip(), "command_v"
except subprocess.TimeoutExpired:
pass
🧰 Tools
🪛 Ruff (0.14.13)

94-94: subprocess call: check for execution of untrusted input

(S603)


95-96: Starting a process with a partial executable path

(S607)

🤖 Prompt for AI Agents
In `@src/binary_finder.py` around lines 92 - 104, the current subprocess.run call
builds a shell command with f'command -v {binary_name}', which allows shell
injection. Update the subprocess.run invocation to pass the binary name as a
positional parameter to sh instead of interpolating it: replace the f-string
with a constant command such as 'command -v -- "$1"', then append a placeholder
for $0 (e.g., 'sh') followed by binary_name as separate list elements after the
command string, so the container's sh receives the binary name as $1. Keep the
same subprocess.run options (capture_output, text, timeout), and preserve the
existing return check (result.returncode == 0 and result.stdout.strip()) and the
returned tag "command_v".


return None, "not_found"

@staticmethod
def discover_all_executables(docker_image: str, use_cache: bool = True) -> List[str]:
    """
    List every executable regular file inside a Docker image.

    A single `find /` is run in a throwaway container; pseudo-filesystems,
    /var and .git directories are filtered out of the results. Successful
    scans are remembered per image in the class-level cache.

    Args:
        docker_image: Full Docker image name
        use_cache: Whether to use cached results

    Returns:
        List of executable file paths found in container; an empty list when
        the scan fails, times out, or raises
    """
    cache = BinaryFinder._container_cache
    limit = BinaryFinder.CONTAINER_TIMEOUT

    # Serve a previous scan of the same image when allowed.
    if use_cache and docker_image in cache:
        return cache[docker_image]

    print(f" → Scanning container filesystem for executables (timeout: {limit}s)...")

    # One pass from the root; each excluded pattern becomes a `-not -path`
    # test, and `|| true` keeps the shell's exit status at 0 even when find
    # complains about unreadable directories.
    excluded_patterns = ('/proc/*', '/sys/*', '/dev/*', '/var/*', '*/.git/*')
    exclusions = ''.join(f'-not -path "{pattern}" ' for pattern in excluded_patterns)
    script = 'find / -type f -executable ' + exclusions + '2>/dev/null || true'
    command = ['docker', 'run', '--rm', '--entrypoint', 'sh', docker_image, '-c', script]

    try:
        completed = subprocess.run(
            command,
            capture_output=True,
            text=True,
            timeout=limit
        )

        # 0 = success, 1 = some files not found (OK); anything else is a failure
        if completed.returncode not in (0, 1):
            print(f" → Find command failed with exit code {completed.returncode}")
            return []

        found = []
        for raw_line in completed.stdout.split('\n'):
            path = raw_line.strip()
            if path:
                found.append(path)
        print(f" → Found {len(found)} executables")

        # Remember the result for later calls on the same image.
        cache[docker_image] = found
        return found

    except subprocess.TimeoutExpired:
        print(f" → Timeout after {limit}s (large image or slow filesystem)")
        return []
    except Exception as e:
        print(f" → Error scanning container: {e}")
        return []

@staticmethod
def generate_candidates(tool_name: str, min_length: int = 2) -> Set[str]:
"""
Generate candidate binary names by simply splitting tool name into words

Args:
tool_name: Tool name from config (e.g., "Apache Drill" or "kubectl")
min_length: Minimum word length to consider (default 2 for short tools like "go", "jq")

Returns:
Set of words to search for in binary names
"""
# Normalize and split into words
normalized = tool_name.lower()
words = re.split(r'[\s\-_]+', normalized)

# Remove common prefix words that aren't part of binary names
ignore_words = {'the', 'a', 'an', 'apache', 'project', 'foundation'}
words = [w for w in words if w and w not in ignore_words and len(w) >= min_length]

return set(words)

@staticmethod
def verify_executable_responds_to_help(binary_path: str, docker_image: str) -> bool:
    """
    Cheaply test whether a binary prints something when asked for help.

    The binary is started as the container entrypoint once per help
    spelling, each attempt limited to 2 seconds. An attempt counts when the
    exit code is 0 or 1 and stdout plus stderr exceed 50 characters.

    Args:
        binary_path: Path to binary
        docker_image: Docker image to test in

    Returns:
        True if binary responds to --help, -h, or help
    """
    for flag in ('--help', '-h', 'help'):
        try:
            # --entrypoint makes the container run exactly this binary.
            probe = subprocess.run(
                ['docker', 'run', '--rm', '--entrypoint', binary_path, docker_image, flag],
                capture_output=True,
                text=True,
                timeout=2
            )
            combined = (probe.stdout + probe.stderr).strip()
        except Exception:
            # Covers timeouts as well as any other failure; try the next flag.
            continue

        # Only a reasonably long reply counts as meaningful help output.
        if probe.returncode in (0, 1) and len(combined) > 50:
            return True

    return False

@staticmethod
def match_executables_to_candidates(
executables: List[str],
candidates: Set[str],
docker_image: Optional[str] = None,
verify_help: bool = False
) -> List[Tuple[str, str, float]]:
"""
Match executables that contain any of the candidate words

Args:
executables: List of full paths to executables
candidates: Set of words to search for
docker_image: Optional Docker image for verification
verify_help: If True, verify executable responds to help commands

Returns:
List of (exe_path, match_type, confidence) tuples, sorted by match quality
"""
matches = []

# Skip common system binaries and script files
skip_binaries = {'sh', 'bash', 'ls', 'cat', 'echo', 'true', 'false', 'test', 'id', 'tr', 'ar', 'as'}
skip_extensions = {'.js', '.ts', '.d.ts', '.json', '.py', '.rb', '.pl', '.sh', '.txt', '.md', '.xml', '.html'}

for exe_path in executables:
exe_name = os.path.basename(exe_path)
exe_name_lower = exe_name.lower()

# Skip system binaries and scripts
if exe_name in skip_binaries:
continue
if any(exe_name.endswith(ext) for ext in skip_extensions):
continue

# Check if any candidate word is in the executable name (case-insensitive)
for word in candidates:
if word in exe_name_lower:
# Simple confidence based on match quality
if exe_name_lower == word:
confidence = 1.0
match_type = 'exact'
elif exe_name_lower.startswith(word):
confidence = 0.9
match_type = 'starts_with'
else:
confidence = 0.7
match_type = 'contains'

# Bonus for binaries in standard locations
if '/usr/bin/' in exe_path or '/usr/local/bin/' in exe_path:
confidence = min(confidence + 0.1, 1.0)

# Optional: Verify it responds to help (quick check)
if verify_help and docker_image:
if not BinaryFinder.verify_executable_responds_to_help(exe_path, docker_image):
# Penalize if it doesn't respond to help
confidence *= 0.5

matches.append((exe_path, match_type, confidence))
break # Only count each executable once

# Sort by confidence (highest first), then by path length (shorter preferred)
matches.sort(key=lambda x: (-x[2], len(x[0])))

return matches

@staticmethod
def discover_binaries_for_tool(docker_image: str, tool_name: str) -> List[Tuple[str, str, float]]:
    """
    Main discovery method - find all matching binaries for a tool

    Candidate words are derived from the tool name. Up to five of them are
    first resolved directly inside the container; only if none resolves is
    the whole container filesystem scanned and matched against the words.

    Args:
        docker_image: Docker image to search in
        tool_name: Name of the tool

    Returns:
        List of (binary_path, match_type, confidence) tuples. A successful
        quick lookup yields a single entry whose match_type is the lookup
        method and whose confidence is 1.0; an empty list means nothing
        was found.
    """
    print(f"\n Discovering binaries for: {tool_name}")
    print(f" Image: {docker_image}")

    # Generate candidates
    candidates = BinaryFinder.generate_candidates(tool_name)
    preview = ', '.join(sorted(candidates)[:10])
    suffix = '...' if len(candidates) > 10 else ''
    print(f" → Generated {len(candidates)} candidates: {preview}{suffix}")

    # Try quick direct lookup first: longest words are the most specific.
    # Ties are broken alphabetically so the lookup order (and therefore the
    # binary returned) does not vary with set iteration order between runs.
    quick_candidates = sorted(candidates, key=lambda word: (-len(word), word))[:5]
    for candidate in quick_candidates:
        binary_path, method = BinaryFinder.find_in_container(docker_image, candidate)
        if binary_path:
            print(f" → Quick match found: {binary_path} (method: {method})")
            return [(binary_path, method, 1.0)]

    # Fall back to full filesystem scan
    print(" → Quick lookup failed, performing full scan...")
    executables = BinaryFinder.discover_all_executables(docker_image)

    if not executables:
        print(" → No executables found in container")
        return []

    # Match executables to candidates (with help verification)
    matches = BinaryFinder.match_executables_to_candidates(
        executables,
        candidates,
        docker_image=docker_image,
        verify_help=True  # Enable help verification
    )

    if matches:
        print(f" → Found {len(matches)} matches")
        # Show top 5 matches
        for exe_path, match_type, confidence in matches[:5]:
            print(f" • {os.path.basename(exe_path)} ({match_type}, confidence: {confidence:.2f})")
    else:
        print(" → No matches found")

    return matches
Loading