diff --git a/facebook_analyzer/__init__.py b/facebook_analyzer/__init__.py deleted file mode 100644 index c4c716a..0000000 --- a/facebook_analyzer/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# This file makes the 'facebook_analyzer' directory a Python package. -# You can leave it empty or add package-level imports here if needed later. - -# For example, you might want to make functions from modules directly available: -# from .phishing_detector import analyze_message_for_phishing -# from .fake_profile_detector import analyze_profile_for_fakeness - -# For now, keeping it simple. User will import specific modules. diff --git a/facebook_analyzer/__pycache__/__init__.cpython-312.pyc b/facebook_analyzer/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index 5e755b3..0000000 Binary files a/facebook_analyzer/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/facebook_analyzer/__pycache__/fake_profile_detector.cpython-312.pyc b/facebook_analyzer/__pycache__/fake_profile_detector.cpython-312.pyc deleted file mode 100644 index df84fe0..0000000 Binary files a/facebook_analyzer/__pycache__/fake_profile_detector.cpython-312.pyc and /dev/null differ diff --git a/facebook_analyzer/__pycache__/phishing_detector.cpython-312.pyc b/facebook_analyzer/__pycache__/phishing_detector.cpython-312.pyc deleted file mode 100644 index 289cd83..0000000 Binary files a/facebook_analyzer/__pycache__/phishing_detector.cpython-312.pyc and /dev/null differ diff --git a/facebook_analyzer/fake_profile_detector.py b/facebook_analyzer/fake_profile_detector.py deleted file mode 100644 index 38872e9..0000000 --- a/facebook_analyzer/fake_profile_detector.py +++ /dev/null @@ -1,199 +0,0 @@ -# facebook_analyzer/fake_profile_detector.py - -import webbrowser - -# Common indicators of fake profiles. -# Each indicator can have a 'weight' for a simple scoring system. -# 'prompt' is what the user will be asked. -# 'type' can be 'yes_no', 'numeric', 'text_analysis' (future), etc. -# 'details_if_yes' can provide more context or ask for more info if the user answers 'yes'. -FAKE_PROFILE_INDICATORS = [ - { - "id": "profile_picture_generic", - "prompt": "Is the profile picture generic, a stock photo, an illustration, or of a celebrity (i.e., not a clear photo of a unique, real person)?", - "type": "yes_no", - "weight_if_yes": 2, - "details_if_yes": "Generic or stolen profile pictures are common for fake accounts." - }, - { - "id": "profile_picture_reverse_search", - "prompt": "Have you tried a reverse image search (e.g., Google Images, TinEye) on the profile picture? If so, did it show the image is widely used, a stock photo, or belongs to someone else?", - "type": "yes_no", - "weight_if_yes": 3, - "details_if_yes": "Reverse image search can often quickly identify stolen or common stock photos." - }, - { - "id": "account_age_very_new", - "prompt": "Does the profile seem very new with little history (e.g., join date is recent, few old posts)? (Requires manual check on the profile)", - "type": "yes_no", - "weight_if_yes": 1, - "details_if_yes": "While not definitive, many fake accounts are newly created." - }, - { - "id": "few_posts_or_activity", - "prompt": "Does the profile have very few posts, photos, or other activity over its lifespan? (Requires manual check)", - "type": "yes_no", - "weight_if_yes": 1, - "details_if_yes": "Lack of genuine activity can be a sign." - }, - { - "id": "generic_or_copied_posts", - "prompt": "Are the posts (if any) generic, nonsensical, repetitive, or seem copied from other sources? 
(Requires manual check)", - "type": "yes_no", - "weight_if_yes": 2, - "details_if_yes": "Content that isn't original or personal is suspicious." - }, - { - "id": "friend_count_mismatch", - "prompt": "Does the profile have a very high number of friends but very little engagement (likes/comments) on their posts, or an unusually low number of friends for a long-standing account? (Requires manual check)", - "type": "yes_no", - "weight_if_yes": 1, - "details_if_yes": "Unusual friend counts or activity ratios can be indicators." - }, - { - "id": "poor_grammar_spelling", - "prompt": "Is the language used in the profile's 'About' section or posts consistently poor in grammar or spelling (beyond typical typos)? (Requires manual check)", - "type": "yes_no", - "weight_if_yes": 1, - "details_if_yes": "Often, hastily created fake profiles have noticeable language issues." - }, - { - "id": "about_section_sparse_or_inconsistent", - "prompt": "Is the 'About' section very sparse, missing key information (like education, work), or contains information that seems inconsistent or overly glamorous/fake? (Requires manual check)", - "type": "yes_no", - "weight_if_yes": 2, - "details_if_yes": "Incomplete or suspicious 'About' information is a red flag." - }, - { - "id": "mutual_friends_suspicious", - "prompt": "If you have mutual friends, do those mutual connections seem legitimate or are they also suspicious-looking profiles?", - "type": "yes_no", - "weight_if_yes": 1, - "details_if_yes": "Fake accounts often connect with other fake accounts." - }, - { - "id": "pressure_or_strange_requests", - "prompt": "Has this profile sent you messages that pressure you for information, money, or to click suspicious links shortly after connecting?", - "type": "yes_no", - "weight_if_yes": 3, - "details_if_yes": "This is a strong indicator of a malicious fake account." - } -] - -def guide_reverse_image_search(image_url=None): - """Opens browser tabs to guide the user through reverse image search.""" - print("\n--- Guiding Reverse Image Search ---") - print("You can use services like Google Images or TinEye to check if a profile picture is used elsewhere.") - if image_url: - print(f"If you have a direct URL for the image: {image_url}") - google_url = f"https://images.google.com/searchbyimage?image_url={image_url}" - tineye_url = f"https://tineye.com/search?url={image_url}" - print(f"Attempting to open Google Images: {google_url}") - webbrowser.open(google_url) - print(f"Attempting to open TinEye: {tineye_url}") - webbrowser.open(tineye_url) - else: - print("If you have the image saved, you can upload it to these sites:") - print("Google Images: https://images.google.com/ (click the camera icon)") - webbrowser.open("https://images.google.com/") - print("TinEye: https://tineye.com/") - webbrowser.open("https://tineye.com/") - print("Look for whether the image is a common stock photo, belongs to a different person, or appears on many unrelated profiles.") - input("Press Enter to continue after performing your search...") - - -def analyze_profile_based_on_user_input(profile_url): - """ - Guides the user through a checklist to assess if a Facebook profile is fake. - Does NOT scrape any data. Relies on user observation. 
- """ - print(f"\n--- Analyzing Facebook Profile (Manual Check) ---") - print(f"Please open the Facebook profile in your browser: {profile_url}") - print("You will be asked a series of questions based on your observations.") - print("This tool does NOT access Facebook directly or scrape any data.") - webbrowser.open(profile_url) # Open for user convenience - - user_responses = {} - total_score = 0 - positive_indicators = [] - - # Ask about reverse image search first - perform_ris = input("Do you want guidance to perform a reverse image search on the profile picture? (yes/no): ").strip().lower() - if perform_ris == 'yes': - img_url_known = input("Do you have a direct URL for the profile image? (yes/no): ").strip().lower() - if img_url_known == 'yes': - actual_img_url = input("Please paste the direct image URL: ").strip() - guide_reverse_image_search(actual_img_url) - else: - guide_reverse_image_search() - print("Now, let's answer the question about the reverse image search based on your findings.") - - - for indicator in FAKE_PROFILE_INDICATORS: - while True: - answer = input(f"{indicator['prompt']} (yes/no): ").strip().lower() - if answer in ['yes', 'no']: - user_responses[indicator['id']] = answer - if answer == 'yes': - total_score += indicator['weight_if_yes'] - positive_indicators.append(f"- {indicator['prompt']} ({indicator['details_if_yes']})") - break - else: - print("Invalid input. Please answer 'yes' or 'no'.") - - print("\n--- Fake Profile Analysis Results ---") - print(f"Profile URL: {profile_url}") - - if not positive_indicators: - print("Based on your answers, no common fake profile indicators were strongly identified.") - print("However, always remain cautious.") - else: - print("The following indicators suggestive of a fake profile were noted based on your input:") - for pi in positive_indicators: - print(pi) - - print(f"\nOverall 'suspicion score': {total_score}") - if total_score == 0: - print("Assessment: No strong indicators noted from your input.") - elif total_score <= 3: - print("Assessment: Low likelihood of being fake based on your input, but remain cautious.") - elif total_score <= 6: - print("Assessment: Medium likelihood. Some indicators suggest this profile could be fake. Exercise caution.") - elif total_score <= 9: - print("Assessment: High likelihood. Several indicators suggest this profile may be fake. High caution advised.") - else: - print("Assessment: Very high likelihood. Many strong indicators suggest this profile is likely fake. Avoid interaction and consider reporting.") - - print("\nDisclaimer:") - print("This analysis is based SOLELY on your manual observations and answers to the checklist.") - print("It is not a definitive judgment. 
False positives and negatives are possible.") - print("Always use your best judgment when interacting with profiles online.") - print("If you suspect a profile is fake and malicious, consider reporting it to Facebook through their official channels.") - - return { - "profile_url": profile_url, - "score": total_score, - "positive_indicators_details": positive_indicators, - "user_responses": user_responses - } - -if __name__ == '__main__': - print("Fake Profile Detector - Manual Checklist Tool") - print("IMPORTANT: This tool does NOT access Facebook or scrape data.") - print("It guides YOU to manually check a profile and answer questions.") - print("------------------------------------------------------------") - - # Example of how it would be called: - # First, ensure the user is aware of the process for reverse image search, as it's a common first step. - # For the test, we'll simulate this. - - test_profile_url = input("Enter a Facebook profile URL to simulate analyzing (e.g., https://www.facebook.com/some.profile): ").strip() - if not test_profile_url: - print("No URL entered, exiting.") - else: - # In a real CLI, you might ask about reverse image search separately first, or integrate it. - # For this direct test, the function itself will ask. - analysis = analyze_profile_based_on_user_input(test_profile_url) - # print("\nFull analysis object (for debugging):") - # import json - # print(json.dumps(analysis, indent=2)) diff --git a/facebook_analyzer/phishing_detector.py b/facebook_analyzer/phishing_detector.py deleted file mode 100644 index f5b65ed..0000000 --- a/facebook_analyzer/phishing_detector.py +++ /dev/null @@ -1,206 +0,0 @@ -import re - -# Keywords common in phishing messages -PHISHING_KEYWORDS = [ - "verify your account", "update your details", "confirm your identity", - "login required", "secure your account", "account suspended", - "unusual activity", "security alert", "important notification", - "action required", "limited time offer", "winner", "prize", - "confidential", "urgent", "immediate attention", "access restricted", - "card declined", "payment issue", "invoice", "refund" -] - -# Patterns for suspicious URLs -# Order matters: more specific/dangerous patterns should come first. 
-SUSPICIOUS_URL_PATTERNS = [ - # Attempts to impersonate legitimate domains by using them as subdomains of a malicious domain - # e.g., facebook.com.malicious.com, login-facebook.com-site.org - r"https?://(?:[a-z0-9\-]+\.)*(?:facebook|fb|instagram|whatsapp)\.com\.[a-z0-9\-]+\.[a-z]+", - r"https?://(?:[a-z0-9\-]+\.)*facebook-[a-z0-9\-]+\.[a-z]+", - r"https?://(?:[a-z0-9\-]+\.)*fb-[a-z0-9\-]+\.[a-z]+", - # Common URL shorteners (can be legitimate but often used in phishing) - r"https?://bit\.ly", - r"https?://goo\.gl", - r"https?://t\.co", # Twitter shortener, often abused - # IP Address URLs - r"https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}", - # Generic keywords in domain that are often suspicious if not part of a known legit service - # e.g., "login", "secure", "account", "update" in a non-standard TLD or unfamiliar domain - r"https?://[^/]*(?:login|secure|account|update|verify|support|admin)[^/]*\.(?:biz|info|tk|ml|ga|cf|gq|xyz|club|top|loan|work|online|site)", - # Very long subdomains or many hyphens (common obfuscation) - r"https?://(?:[a-z0-9\-]+\.){4,}", # 4 or more subdomains - r"https?://[^/]*\-.*\-.*\-.*[a-z]+", # multiple hyphens in domain part -] - -LEGITIMATE_DOMAINS = [ - "facebook.com", - "www.facebook.com", - "m.facebook.com", - "fb.com", # Official Facebook shortener - "www.fb.com", - "instagram.com", - "www.instagram.com", - "whatsapp.com", - "www.whatsapp.com", - "google.com", # For test cases - "www.google.com", - "amazon.com", # For test cases - "www.amazon.com" -] - -def extract_urls(text): - """Extracts URLs from a given text.""" - url_pattern = r'https?://[^\s<>"]+|www\.[^\s<>"]+' - return re.findall(url_pattern, text) - -def get_domain_from_url(url): - """Extracts the domain (e.g., 'example.com') from a URL.""" - if "://" in url: - domain = url.split("://")[1].split("/")[0].split("?")[0] - else: # Handles www.example.com cases without http(s) - domain = url.split("/")[0].split("?")[0] - return domain.lower() - -def is_url_suspicious(url): - """ - Checks if a URL is suspicious. - Returns a tuple: (bool_is_suspicious, reason_string) - """ - normalized_url_for_pattern_matching = url.lower() - domain = get_domain_from_url(url) - - # 1. Check against explicit legitimate domains - # This is a strong signal that it *might* be okay, but phishing can still occur on legit sites (e.g., compromised page). - # However, for this tool, if the *domain itself* is legit, we'll primarily rely on other indicators for now. - if domain in LEGITIMATE_DOMAINS: - # We could add checks here for suspicious paths on legitimate domains, - # but that's more complex. For now, if the core domain is legit, - # we won't flag it based on domain alone. - # Let's still check if it matches any *very specific* impersonation patterns - # that might accidentally include a legit domain name within them. - for pattern in [ - r"https?://(?:[a-z0-9\-]+\.)*(?:facebook|fb|instagram|whatsapp)\.com\.[a-z0-9\-]+\.[a-z]+", #e.g. facebook.com.hacker.com - r"https?://(?:[a-z0-9\-]+\.)*facebook-[a-z0-9\-]+\.[a-z]+" #e.g. my-facebook-login.hacker.com - ]: - if re.search(pattern, normalized_url_for_pattern_matching, re.IGNORECASE): - # Check if the *actual domain* is the legit one, not just contained. - # e.g. "facebook.com.hacker.com" contains "facebook.com" but domain is "hacker.com" - if not domain.endswith("facebook.com"): # Simplified check for this example - return True, f"URL impersonates a legitimate domain: {pattern}" - return False, "URL domain is on the legitimate list." - - # 2. 
Check against known suspicious patterns (these should be more specific) - for pattern in SUSPICIOUS_URL_PATTERNS: - if re.search(pattern, normalized_url_for_pattern_matching, re.IGNORECASE): - return True, f"URL matches suspicious pattern: {pattern}" - - # 3. Heuristic: Check if a known legitimate domain name is *part* of the domain, - # but the domain itself is NOT on the legitimate list. - # E.g., "facebook-login.some-other-site.com" - for legit_substring in ["facebook", "fb", "instagram", "whatsapp"]: - if legit_substring in domain: - # We already checked if `domain` is in `LEGITIMATE_DOMAINS`. - # So if we're here, it means `legit_substring` is in `domain`, but `domain` itself is not legit. - return True, f"URL contains name of a legitimate service ('{legit_substring}') but is not an official domain." - - return False, "URL does not match common suspicious patterns and is not on the explicit legitimate list." - - -def analyze_message_for_phishing(message_text): - """ - Analyzes a message for phishing indicators. - Returns a dictionary with findings. - """ - findings = { - "score": 0, # Overall phishing likelihood score (higher is more suspicious) - "keywords_found": [], - "suspicious_urls_found": [], - "urls_extracted": [], - "summary": "" - } - - # 1. Analyze text for keywords - message_lower = message_text.lower() - for keyword in PHISHING_KEYWORDS: - if keyword in message_lower: - findings["keywords_found"].append(keyword) - findings["score"] += 1 - - # 2. Extract and analyze URLs - urls = extract_urls(message_text) - findings["urls_extracted"] = urls - for url in urls: - is_susp, reason = is_url_suspicious(url) - if is_susp: - findings["suspicious_urls_found"].append({"url": url, "reason": reason}) - findings["score"] += 2 # Higher weight for suspicious URLs - - # 3. Generate summary - if not findings["keywords_found"] and not findings["suspicious_urls_found"]: - findings["summary"] = "No immediate phishing indicators found. However, always exercise caution with links and requests for information." - else: - summary_parts = [] - if findings["keywords_found"]: - summary_parts.append(f"Found {len(findings['keywords_found'])} suspicious keyword(s): {', '.join(findings['keywords_found'])}.") - if findings["suspicious_urls_found"]: - summary_parts.append(f"Found {len(findings['suspicious_urls_found'])} suspicious URL(s).") - for sus_url in findings["suspicious_urls_found"]: - summary_parts.append(f" - {sus_url['url']} (Reason: {sus_url['reason']})") - - findings["summary"] = " ".join(summary_parts) - if findings["score"] > 0: - findings["summary"] += f" Overall phishing score: {findings['score']} (higher is more suspicious)." - - - return findings - -if __name__ == '__main__': - # Example Usage - original_test_messages = [ - ("URGENT: Your Facebook account has unusual activity. Please verify your account now by clicking http://facebook.security-update.com/login to avoid suspension.", "Original 1"), - ("Hey, check out this cool site: www.google.com", "Original 2"), - ("Your package is waiting for delivery. Update your shipping details here: http://bit.ly/fakepackage", "Original 3"), - ("Hi, this is your bank. We need you to confirm your identity due to a login required. Please visit https://mybank.secure-access-point.net/confirm", "Original 4"), - ("A login to your account from a new device was detected. If this wasn't you, please secure your account at http://123.45.67.89/facebook_login", "Original 5"), - ("Click here to claim your prize! 
http://winner.com/prize-claim-form-xyz", "Original 6"), - ("Official communication from Facebook: Please review our new terms at https://facebook.com/terms. This is important for your account security.", "Original 7") - ] - - additional_test_messages = [ - ("Security Alert! Update your info at http://facebook.com.hacker.com and also check this http://bit.ly/anotherlink", "Additional 1: Multiple suspicious URLs"), - ("URGENT: verify your account at https://facebook.com/security/alerts - this is a real link, but also check http://mysecurity-fb-check.com", "Additional 2: Mix of legit FB URL and suspicious one with keywords"), - ("Hello there, how are you doing today?", "Additional 3: No keywords, no URLs"), - ("Important security update from Facebook. Please login at https://www.facebook.com to review your settings. Your account safety is our priority.", "Additional 4: Keywords but legit URL"), - ("Check this out: http://bit.ly/legitGoogleDoc - this could be a legit shortened link (hard to tell without unshortening)", "Additional 5: URL shortener, potentially legit content") - ] - - all_test_messages = original_test_messages + additional_test_messages - - for i, (msg, label) in enumerate(all_test_messages): - print(f"--- Analyzing Message ({label}) ---") - print(f"Message: {msg}") - analysis_result = analyze_message_for_phishing(msg) - print(f"Score: {analysis_result['score']}") - print(f"Keywords: {analysis_result['keywords_found']}") - print(f"Suspicious URLs: {analysis_result['suspicious_urls_found']}") - print(f"All URLs: {analysis_result['urls_extracted']}") - print(f"Summary: {analysis_result['summary']}") - print("-" * 30 + "\n") - - # Test URL suspicion logic directly - print("\n--- Testing URL Suspicion Logic ---") - test_urls = [ - "http://facebook.com.malicious.com/login.html", - "https://www.facebook.com/officialpage", - "http://fb.com-security-alert.com", - "https://legit-service.com/facebook_integration", # Might be ok - "http://192.168.1.10/phish", - "https.google.com", - "www.amazon.com/deals", - "http://bit.ly/randomstuff", - "https://totally-not-facebook.com", - "http://facebook.com" # Should not be suspicious by default - ] - for url in test_urls: - is_susp, reason = is_url_suspicious(url) - print(f"URL: {url} -> Suspicious: {is_susp}, Reason: {reason}") diff --git a/scam_detector/__init__.py b/scam_detector/__init__.py deleted file mode 100644 index f97bd31..0000000 --- a/scam_detector/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# This file makes 'scam_detector' a Python package. 
- -# Expose constants and potentially functions if needed by other modules directly -from .heuristics import ( - URGENCY_KEYWORDS, - SENSITIVE_INFO_KEYWORDS, - TOO_GOOD_TO_BE_TRUE_KEYWORDS, - GENERIC_GREETINGS, - TECH_SUPPORT_SCAM_KEYWORDS, - PAYMENT_KEYWORDS, - URL_PATTERN, - SUSPICIOUS_TLDS, - CRYPTO_ADDRESS_PATTERNS, - PHONE_NUMBER_PATTERN, - HEURISTIC_WEIGHTS -) - -from .analyzer import analyze_text_for_scams diff --git a/scam_detector/__pycache__/__init__.cpython-312.pyc b/scam_detector/__pycache__/__init__.cpython-312.pyc deleted file mode 100644 index f9c7c31..0000000 Binary files a/scam_detector/__pycache__/__init__.cpython-312.pyc and /dev/null differ diff --git a/scam_detector/__pycache__/analyzer.cpython-312.pyc b/scam_detector/__pycache__/analyzer.cpython-312.pyc deleted file mode 100644 index 9c2bc3f..0000000 Binary files a/scam_detector/__pycache__/analyzer.cpython-312.pyc and /dev/null differ diff --git a/scam_detector/__pycache__/heuristics.cpython-312.pyc b/scam_detector/__pycache__/heuristics.cpython-312.pyc deleted file mode 100644 index e30fd50..0000000 Binary files a/scam_detector/__pycache__/heuristics.cpython-312.pyc and /dev/null differ diff --git a/scam_detector/analyzer.py b/scam_detector/analyzer.py deleted file mode 100644 index 0a1e467..0000000 --- a/scam_detector/analyzer.py +++ /dev/null @@ -1,211 +0,0 @@ -import re -from urllib.parse import urlparse -from .heuristics import ( - URGENCY_KEYWORDS, - SENSITIVE_INFO_KEYWORDS, - TOO_GOOD_TO_BE_TRUE_KEYWORDS, - GENERIC_GREETINGS, - TECH_SUPPORT_SCAM_KEYWORDS, - PAYMENT_KEYWORDS, - URL_PATTERN, - SUSPICIOUS_TLDS, - CRYPTO_ADDRESS_PATTERNS, - PHONE_NUMBER_PATTERN, - HEURISTIC_WEIGHTS -) - -# Pre-compile a regex for suspicious TLDs for efficiency if used frequently -# This creates a pattern like: \.(xyz|top|loan|club|...)$ -# Ensure TLDs are escaped if they contain special regex characters (none in current list) -SUSPICIOUS_TLD_REGEX = re.compile(r"\.(" + "|".join(tld.lstrip('.') for tld in SUSPICIOUS_TLDS) + r")$", re.IGNORECASE) - -# Keywords that might appear in URLs that are suspicious (especially if not on a primary domain) -SUSPICIOUS_URL_PATH_KEYWORDS = ["login", "verify", "account", "secure", "update", "signin", "banking", "password"] - - -def analyze_text_for_scams(text_content): - """ - Analyzes a block of text content for various scam indicators. - - Args: - text_content (str): The text to analyze. - - Returns: - dict: A dictionary containing: - 'score' (float): An overall scam likelihood score. - 'indicators_found' (list): A list of strings describing found indicators. - 'urls_analyzed' (list): A list of dicts for each URL found and its analysis. - """ - if not text_content: - return {"score": 0.0, "indicators_found": [], "urls_analyzed": []} - - text_lower = text_content.lower() # For case-insensitive keyword matching - score = 0.0 - indicators_found = [] - urls_analyzed_details = [] - - # 1. 
Keyword-based checks - keyword_checks = { - "URGENCY": URGENCY_KEYWORDS, - "SENSITIVE_INFO": SENSITIVE_INFO_KEYWORDS, - "TOO_GOOD_TO_BE_TRUE": TOO_GOOD_TO_BE_TRUE_KEYWORDS, - "GENERIC_GREETING": GENERIC_GREETINGS, - "TECH_SUPPORT": TECH_SUPPORT_SCAM_KEYWORDS, - "PAYMENT_REQUEST": PAYMENT_KEYWORDS, - } - - for category, keywords in keyword_checks.items(): - for keyword in keywords: - if keyword in text_lower: - message = f"Presence of '{category.replace('_', ' ').title()}' keyword: '{keyword}'" - indicators_found.append(message) - score += HEURISTIC_WEIGHTS.get(category, 1.0) - # Optimization: could break after first keyword in category if only counting category once - # For now, sum weights for each keyword hit to emphasize multiple indicators. - - # 2. Regex-based checks - # URLs - found_urls = URL_PATTERN.findall(text_content) - for url_str in found_urls: - url_analysis = {"url": url_str, "is_suspicious": False, "reasons": []} - - parsed_url = None - try: - # Add scheme if missing for urlparse - if not url_str.startswith(('http://', 'https://', 'ftp://')): - temp_url_str_for_parse = 'http://' + url_str - else: - temp_url_str_for_parse = url_str - parsed_url = urlparse(temp_url_str_for_parse) - except Exception as e: - # print(f"Warning: Could not parse URL '{url_str}': {e}") - url_analysis["reasons"].append(f"Could not parse URL string.") - # Continue with regex checks on url_str itself if parsing fails - - # Check for suspicious TLDs - domain_to_check = parsed_url.hostname if parsed_url else url_str # Fallback to full string if parse failed - if domain_to_check and SUSPICIOUS_TLD_REGEX.search(domain_to_check): - reason = f"URL uses a potentially suspicious TLD (e.g., {SUSPICIOUS_TLD_REGEX.search(domain_to_check).group(0)})" - url_analysis["reasons"].append(reason) - url_analysis["is_suspicious"] = True - score += HEURISTIC_WEIGHTS.get("SUSPICIOUS_TLD", 1.0) - - # Check for suspicious keywords in URL path/query or domain itself - # (e.g. yourbank.com.suspicious.xyz/login or secure-payment-verify.com) - # This is a simple check; more advanced would involve checking against known legit domains. - for keyword in SUSPICIOUS_URL_PATH_KEYWORDS: - if keyword in url_str.lower(): # Check the whole URL string - # Avoid flagging legit sites like "myaccount.google.com" just for "account" - # This needs refinement: only flag if domain is not a known major one. - # For MVP, this check is broad. - is_known_major_domain = False - if parsed_url and parsed_url.hostname: - known_domains = ["google.com", "facebook.com", "amazon.com", "apple.com", "microsoft.com", "paypal.com"] # Example list - for kd in known_domains: - if parsed_url.hostname.endswith(kd): - is_known_major_domain = True - break - - if not is_known_major_domain: - reason = f"URL contains suspicious keyword: '{keyword}'" - url_analysis["reasons"].append(reason) - url_analysis["is_suspicious"] = True - score += HEURISTIC_WEIGHTS.get("SUSPICIOUS_URL_KEYWORD", 1.0) - break # Only count one such keyword per URL for now - - if url_analysis["is_suspicious"]: - indicators_found.append(f"Suspicious URL found: {url_str} (Reasons: {'; '.join(url_analysis['reasons'])})") - urls_analyzed_details.append(url_analysis) - - - # Crypto Addresses - for crypto_name, pattern in CRYPTO_ADDRESS_PATTERNS.items(): - if pattern.search(text_content): # Search original text, not lowercased, as patterns might be case-sensitive - message = f"Potential {crypto_name} cryptocurrency address found." 
- indicators_found.append(message) - score += HEURISTIC_WEIGHTS.get("CRYPTO_ADDRESS", 2.0) - - # Phone Numbers (Presence alone is not a strong indicator, context matters, which is hard for MVP) - # For MVP, we'll just note if one is found. The weighting is important here. - if PHONE_NUMBER_PATTERN.search(text_content): - message = "Phone number detected in text." - indicators_found.append(message) - score += HEURISTIC_WEIGHTS.get("PHONE_NUMBER_UNSOLICITED", 0.25) # Low weight - - # TODO: Add more heuristics like: - # - Grammar/spelling (complex, likely requires external library for good results) - # - Sense of urgency combined with financial request - # - Analysis of sender (if email headers were available) - - return { - "score": round(score, 2), - "indicators_found": indicators_found, - "urls_analyzed": urls_analyzed_details - } - -if __name__ == '__main__': - test_cases = [ - { - "name": "Phishing Attempt", - "text": "Dear Customer, your account is suspended due to unusual activity. Please verify your password at http://yourbank.secure-login-update.xyz/verify immediately. Act now to avoid closure.", - "expected_min_score": 5.0, # URGENCY, SENSITIVE_INFO, SUSPICIOUS_TLD, SUSPICIOUS_URL_KEYWORD - }, - { - "name": "Prize Scam", - "text": "CONGRATULATIONS YOU WON!!! You've won a free iPhone! Claim your reward now at www.totally-real-prize.top/claim-123. Provide your details to receive your prize.", - "expected_min_score": 4.0, # TOO_GOOD_TO_BE_TRUE, SENSITIVE_INFO, SUSPICIOUS_TLD - }, - { - "name": "Tech Support Scam", - "text": "Microsoft Support Alert: Your computer is infected with a virus! Call immediately 1-800-FAKE-TECH for a technician to get remote access. Your IP address compromised.", - "expected_min_score": 4.0, # TECH_SUPPORT, URGENCY, PHONE_NUMBER - }, - { - "name": "Crypto Payment Scam", - "text": "Urgent payment needed for outstanding invoice. Send 0.5 BTC to 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa to settle your account.", - "expected_min_score": 4.0, # URGENCY, PAYMENT_REQUEST, CRYPTO_ADDRESS - }, - { - "name": "Legitimate-sounding Message", - "text": "Hello John, just a reminder about our meeting tomorrow at 10 AM. Please find the agenda attached. Website: www.ourcompany.com. Call me if you have questions: (123) 456-7890", - "expected_max_score": 2.0, # Might pick up phone number, or URL if not whitelisted - }, - { - "name": "Generic Greeting Email", - "text": "Dear valued customer, We are updating our terms of service. No action needed from your side. 
Visit https://realcompany.com/terms for details.", - "expected_max_score": 1.0, # GENERIC_GREETING - }, - { - "name": "URL with suspicious keyword but known domain", - "text": "Please login to your account at https://myaccount.google.com/login-activity to check recent activity.", - "expected_max_score": 0.5, # Should not flag "login" or "account" heavily due to known domain - } - ] - - for case in test_cases: - print(f"\n--- Test Case: {case['name']} ---") - print(f"Text: \"{case['text'][:100]}...\"" if len(case['text']) > 100 else f"Text: \"{case['text']}\"") - results = analyze_text_for_scams(case['text']) - print(f"Score: {results['score']}") - print("Indicators:") - for indicator in results['indicators_found']: - print(f" - {indicator}") - if results['urls_analyzed']: - print("URLs Analyzed:") - for url_info in results['urls_analyzed']: - print(f" - URL: {url_info['url']}, Suspicious: {url_info['is_suspicious']}, Reasons: {url_info.get('reasons', [])}") - - if "expected_min_score" in case: - assert results['score'] >= case['expected_min_score'], f"Score {results['score']} was less than expected min {case['expected_min_score']}" - print(f"Assertion: Score >= {case['expected_min_score']} PASSED") - if "expected_max_score" in case: - assert results['score'] <= case['expected_max_score'], f"Score {results['score']} was more than expected max {case['expected_max_score']}" - print(f"Assertion: Score <= {case['expected_max_score']} PASSED") - - print("\n--- Test with empty text ---") - empty_results = analyze_text_for_scams("") - assert empty_results['score'] == 0.0 - assert not empty_results['indicators_found'] - print("Empty text test passed.") - - print("\nCore analysis engine tests completed.") diff --git a/scam_detector/heuristics.py b/scam_detector/heuristics.py deleted file mode 100644 index d43c48c..0000000 --- a/scam_detector/heuristics.py +++ /dev/null @@ -1,163 +0,0 @@ -import re - -# --- Keyword Lists (case-insensitive matching will be applied) --- - -# Keywords/phrases indicating urgency or pressure -URGENCY_KEYWORDS = [ - "urgent", "immediate action required", "act now", "limited time", - "account suspended", "account will be closed", "final warning", - "security alert", "unusual activity detected", "important notification", - "don't delay", "expires soon", "offer ends today", "last chance", - "your subscription will be cancelled", "payment declined" # Removed "action needed" -] - -# Keywords/phrases related to requests for sensitive information -SENSITIVE_INFO_KEYWORDS = [ - "verify your password", "confirm your password", "update your password", - "password", "username", "login details", "credentials", - "social security number", "ssn", - "bank account", "account number", "routing number", "credit card number", - "cvv", "pin number", "mother's maiden name", "security question", - "confirm your details", "update your information", "verify your account", - "provide your details", "personal information" -] - -# Keywords/phrases indicating too-good-to-be-true offers, prizes, etc. 
-TOO_GOOD_TO_BE_TRUE_KEYWORDS = [ - "you have won", "you've won", "congratulations you won", "winner", "prize", - "free gift", "claim your reward", "lottery", "sweepstakes", - "guaranteed", "risk-free", "earn money fast", "work from home easy", - "investment opportunity", "high return", "get rich quick", - "inheritance", " unclaimed funds", "nigerian prince" # Classic ones -] - -# Generic greetings/salutations that can be suspicious in unsolicited contexts -GENERIC_GREETINGS = [ - "dear customer", "dear user", "dear valued customer", "dear account holder", - "dear friend", "hello sir/madam", "greetings" - # Note: "Hello" or "Hi" by themselves are too common to be reliably suspicious -] - -# Keywords often found in tech support scams -TECH_SUPPORT_SCAM_KEYWORDS = [ - "microsoft support", "windows support", "apple support", - "virus detected", "malware found", "your computer is infected", - "call immediately", "technician", "remote access", "ip address compromised" -] - -# Keywords related to payment requests or financial transactions -PAYMENT_KEYWORDS = [ - "payment", "invoice", "bill", "outstanding balance", "transfer funds", - "wire transfer", "gift card", "cryptocurrency", "bitcoin", "western union", "moneygram", - "urgent payment needed", "settle your account" -] - - -# --- Regular Expression Patterns --- - -# Basic URL detection - this is simple and can be expanded -# It aims to find things that look like URLs. More sophisticated parsing will be needed -# if we want to break them down further or check TLDs more accurately here. -URL_PATTERN = re.compile( - r'(?:(?:https?|ftp):\/\/|www\.)' # http://, https://, ftp://, www. - r'(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*' # Non-space chars in URL - r'(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])', # Last char - re.IGNORECASE -) - -# Suspicious Top-Level Domains (TLDs) - This list is not exhaustive! -# Scammers often use newer, cheaper, or less common TLDs. -SUSPICIOUS_TLDS = [ - '.xyz', '.top', '.loan', '.club', '.work', '.online', '.biz', '.info', - '.icu', '.gq', '.cf', '.tk', '.ml', # Often free TLDs abused - '.link', '.click', '.site', '.live', '.buzz', '.stream', '.download', - # Sometimes, very long TLDs can be suspicious if combined with other factors -] -# Regex to check if a URL ends with one of these TLDs -# (Needs to be used after extracting the domain from a URL) -# Example: r"\.(xyz|top|loan)$" - will be built dynamically in analyzer - -# Pattern for detecting strings that look like cryptocurrency addresses -CRYPTO_ADDRESS_PATTERNS = { - "BTC": re.compile(r'\b(1[a-km-zA-HJ-NP-Z1-9]{25,34}|3[a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-zA-HJ-NP-Z0-9]{25,90})\b'), - "ETH": re.compile(r'\b(0x[a-fA-F0-9]{40})\b'), - # Add more for other common cryptos like LTC, XMR if needed -} - -# Pattern for phone numbers (very generic, adjust for specific country needs if possible) -# This is a basic example and might catch non-phone numbers or miss some valid ones. -# It aims for sequences of 7-15 digits, possibly with spaces, hyphens, or parentheses. -PHONE_NUMBER_PATTERN = re.compile( - r'(\+?\d{1,3}[-.\s]?)?(\(?\d{2,4}\)?[-.\s]?)?(\d{3,4}[-.\s]?\d{3,4})' # Simplified - # r'(?:(?:\+|00)[1-9]\d{0,2}[-.\s]?)?(?:(?:\(\d{1,4}\)|\d{1,4})[-.\s]?)?(?:\d{1,4}[-.\s]?){1,4}\d{1,4}' -) - - -# --- Scoring Weights (Example - can be tuned) --- -# These weights can be used by the analyzer to calculate a scam score. 
-HEURISTIC_WEIGHTS = { - "URGENCY": 1.5, - "SENSITIVE_INFO": 2.5, - "TOO_GOOD_TO_BE_TRUE": 2.0, - "GENERIC_GREETING": 0.5, # Lower weight as it's a weaker indicator alone - "TECH_SUPPORT": 2.0, - "PAYMENT_REQUEST": 1.5, - "SUSPICIOUS_URL_KEYWORD": 1.0, # e.g., "login," "verify" in URL path with non-primary domain - "SUSPICIOUS_TLD": 2.0, - "CRYPTO_ADDRESS": 2.5, # Requesting crypto is often a scam indicator - "PHONE_NUMBER_UNSOLICITED": 1.0, # Presence of phone number in unsolicited mail could be for callback scam - # "GRAMMAR_SPELLING": 0.5 (If implemented) -} - - -if __name__ == '__main__': - print("--- Heuristic Definitions ---") - print(f"Loaded {len(URGENCY_KEYWORDS)} urgency keywords.") - print(f"Loaded {len(SENSITIVE_INFO_KEYWORDS)} sensitive info keywords.") - print(f"Loaded {len(TOO_GOOD_TO_BE_TRUE_KEYWORDS)} too-good-to-be-true keywords.") - print(f"Loaded {len(GENERIC_GREETINGS)} generic greetings.") - print(f"Loaded {len(TECH_SUPPORT_SCAM_KEYWORDS)} tech support scam keywords.") - print(f"Loaded {len(PAYMENT_KEYWORDS)} payment keywords.") - - print(f"\nURL Pattern: {URL_PATTERN.pattern}") - print(f"Suspicious TLDs example: {SUSPICIOUS_TLDS[:5]}") - - print("\nCrypto Address Patterns:") - for crypto, pattern in CRYPTO_ADDRESS_PATTERNS.items(): - print(f" {crypto}: {pattern.pattern}") - - print(f"\nPhone Number Pattern: {PHONE_NUMBER_PATTERN.pattern}") - - print("\nHeuristic Weights:") - for category, weight in HEURISTIC_WEIGHTS.items(): - print(f" {category}: {weight}") - - # Test URL pattern - test_text_with_urls = "Visit www.example.com or http://another-site.co.uk/path?query=1 and also https://test.xyz/secure" - found_urls = URL_PATTERN.findall(test_text_with_urls) - print(f"\nURLs found in test text: {found_urls}") - assert len(found_urls) == 3 - - # Test Crypto patterns - btc_text = "Send 1 BTC to 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa now!" - eth_text = "My address is 0x1234567890abcdef1234567890abcdef12345678" - no_crypto_text = "This is a normal message." - - assert CRYPTO_ADDRESS_PATTERNS["BTC"].search(btc_text) - assert CRYPTO_ADDRESS_PATTERNS["ETH"].search(eth_text) - assert not CRYPTO_ADDRESS_PATTERNS["BTC"].search(no_crypto_text) - print("Crypto address pattern tests passed.") - - # Test phone number pattern (basic) - phone_text_1 = "Call us at (123) 456-7890 for help." - phone_text_2 = "Our number is +44 20 7946 0958." - phone_text_3 = "Contact 1234567890." - no_phone_text = "No number here." - - assert PHONE_NUMBER_PATTERN.search(phone_text_1) - assert PHONE_NUMBER_PATTERN.search(phone_text_2) - assert PHONE_NUMBER_PATTERN.search(phone_text_3) - assert not PHONE_NUMBER_PATTERN.search(no_phone_text) - print("Phone number pattern tests passed (basic).") - - print("\nHeuristics module loaded and basic regex patterns tested.") diff --git a/scam_main.py b/scam_main.py deleted file mode 100644 index 2648967..0000000 --- a/scam_main.py +++ /dev/null @@ -1,101 +0,0 @@ -import argparse -import sys -from scam_detector.analyzer import analyze_text_for_scams - -def main(): - parser = argparse.ArgumentParser( - description="Text-based Scam Detection Tool. Analyzes input text for common scam indicators.", - epilog="Example: python scam_main.py --text \"Dear Customer, click http://suspicious.link/login to verify your account now!\"" - ) - - group = parser.add_mutually_exclusive_group(required=True) - group.add_argument( - "-t", "--text", - help="Text content to analyze for scams." - ) - group.add_argument( - "-f", "--file", - help="Path to a plain text file to read content from." 
- ) - group.add_argument( - "--stdin", - action="store_true", - help="Read text content from standard input (e.g., via pipe)." - ) - - parser.add_argument( - "-v", "--verbose", - action="store_true", - help="Enable verbose output (shows detailed URL analysis if URLs are found)." - ) - - # Add a threshold argument for a simple alert - parser.add_argument( - "--threshold", - type=float, - default=5.0, # Default threshold, can be adjusted - help="Score threshold above which a 'High Risk' warning is displayed (default: 5.0)." - ) - - args = parser.parse_args() - - input_text = "" - if args.text: - input_text = args.text - elif args.file: - try: - with open(args.file, 'r', encoding='utf-8') as f: - input_text = f.read() - except FileNotFoundError: - print(f"Error: File not found at {args.file}") - sys.exit(1) - except Exception as e: - print(f"Error reading file {args.file}: {e}") - sys.exit(1) - elif args.stdin: - print("Reading from stdin. Press Ctrl+D (Linux/macOS) or Ctrl+Z then Enter (Windows) to end input.") - input_text = sys.stdin.read() - - if not input_text.strip(): - print("Error: No input text provided to analyze.") - sys.exit(1) - - print("\nAnalyzing text...") - results = analyze_text_for_scams(input_text) - - print("\n--- Scam Analysis Results ---") - print(f"Overall Scam Likelihood Score: {results['score']}") - - if results['score'] == 0.0 and not results['indicators_found']: - print("No specific scam indicators found in the text.") - elif results['score'] < args.threshold / 2 : # Example: low risk - print("Assessment: Low risk of being a scam based on heuristics.") - elif results['score'] < args.threshold: # Example: medium risk - print("Assessment: Medium risk. Some indicators suggest caution.") - else: # High risk - print(f"WARNING: High risk! Score exceeds threshold of {args.threshold}.") - print("This content has multiple indicators commonly found in scams.") - - if results['indicators_found']: - print("\nIndicators Found:") - for indicator in results['indicators_found']: - print(f" - {indicator}") - - if args.verbose and results['urls_analyzed']: - print("\nDetailed URL Analysis:") - for url_info in results['urls_analyzed']: - print(f" - URL: {url_info['url']}") - print(f" Suspicious: {url_info['is_suspicious']}") - if url_info['reasons']: - print(f" Reasons: {'; '.join(url_info['reasons'])}") - else: - print(f" Reasons: None") - elif results['urls_analyzed'] and not args.verbose: - print("\n(Run with --verbose to see detailed URL analysis if URLs were found)") - - - print("\nDisclaimer: This tool uses heuristic-based detection and is not foolproof.") - print("Always exercise caution and use your best judgment. 
Do not rely solely on this tool for security decisions.") - -if __name__ == "__main__": - main() diff --git a/social_media_analyzer/__init__.py b/social_media_analyzer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/social_media_analyzer/__pycache__/__init__.cpython-312.pyc b/social_media_analyzer/__pycache__/__init__.cpython-312.pyc new file mode 100644 index 0000000..4155aeb Binary files /dev/null and b/social_media_analyzer/__pycache__/__init__.cpython-312.pyc differ diff --git a/social_media_analyzer/__pycache__/fake_profile_detector.cpython-312.pyc b/social_media_analyzer/__pycache__/fake_profile_detector.cpython-312.pyc new file mode 100644 index 0000000..7b14c4b Binary files /dev/null and b/social_media_analyzer/__pycache__/fake_profile_detector.cpython-312.pyc differ diff --git a/social_media_analyzer/__pycache__/heuristics.cpython-312.pyc b/social_media_analyzer/__pycache__/heuristics.cpython-312.pyc new file mode 100644 index 0000000..ea37fce Binary files /dev/null and b/social_media_analyzer/__pycache__/heuristics.cpython-312.pyc differ diff --git a/social_media_analyzer/__pycache__/main.cpython-312.pyc b/social_media_analyzer/__pycache__/main.cpython-312.pyc new file mode 100644 index 0000000..838e2ea Binary files /dev/null and b/social_media_analyzer/__pycache__/main.cpython-312.pyc differ diff --git a/social_media_analyzer/__pycache__/scam_detector.cpython-312.pyc b/social_media_analyzer/__pycache__/scam_detector.cpython-312.pyc new file mode 100644 index 0000000..cfb3a2a Binary files /dev/null and b/social_media_analyzer/__pycache__/scam_detector.cpython-312.pyc differ diff --git a/social_media_analyzer/fake_profile_detector.py b/social_media_analyzer/fake_profile_detector.py new file mode 100644 index 0000000..fc14416 --- /dev/null +++ b/social_media_analyzer/fake_profile_detector.py @@ -0,0 +1,194 @@ +import webbrowser + +# --- Platform-Specific Advice --- +PLATFORM_SPECIFIC_ADVICE = { + "instagram": [ + "Check for a high follower count but very low engagement (likes/comments) on posts.", + "Look for accounts that exclusively post promotional content or ads.", + "Be wary of accounts that have a large number of followers but follow very few people." + ], + "tinder": [ + "Be cautious of profiles that seem 'too perfect' with professional-level photos.", + "Watch out for profiles that immediately try to move the conversation to another platform (e.g., WhatsApp).", + "Be wary of profiles with very sparse information or only a single photo." + ], + "tiktok": [ + "Check if the account has a large number of followers but the videos have very few views or likes.", + "Look for accounts that spam comments with links or promotional messages.", + "Be suspicious of accounts that use stolen or unoriginal content." + ], + "snapchat": [ + "Be careful with accounts that you don't know personally, especially if they ask for personal information.", + "Scammers may use Snapchat to send disappearing messages with malicious links." + ], + "whatsapp": [ + "Be wary of messages from unknown numbers, especially if they contain links or ask for money.", + "Check the profile picture and status of unknown contacts for anything suspicious.", + ], + "wechat": [ + "Be cautious of accounts that you don't know, especially if they ask for money or personal information.", + "Scammers may use fake accounts to impersonate friends or family." 
+ ], + "facebook": [ + "Check the 'About' section for inconsistencies or lack of information.", + "Look at the age of the account and the history of posts.", + "Be suspicious of friend requests from people you don't know, especially if you have no mutual friends." + ] +} + + +# --- Generic Fake Profile Indicators --- +FAKE_PROFILE_INDICATORS = [ + { + "id": "profile_picture_generic", + "prompt": "Is the profile picture generic, a stock photo, an illustration, or of a celebrity?", + "weight_if_yes": 2, + "details_if_yes": "Generic or stolen profile pictures are common for fake accounts." + }, + { + "id": "profile_picture_reverse_search", + "prompt": "Have you tried a reverse image search on the profile picture? Did it show the image is widely used or belongs to someone else?", + "weight_if_yes": 3, + "details_if_yes": "Reverse image search can quickly identify stolen or common stock photos." + }, + { + "id": "account_age_very_new", + "prompt": "Does the profile seem very new with little history (e.g., recent join date, few old posts)?", + "weight_if_yes": 1, + "details_if_yes": "Many fake accounts are newly created." + }, + { + "id": "few_posts_or_activity", + "prompt": "Does the profile have very few posts, photos, or other activity over its lifespan?", + "weight_if_yes": 1, + "details_if_yes": "Lack of genuine activity can be a sign." + }, + { + "id": "generic_or_copied_posts", + "prompt": "Are the posts (if any) generic, nonsensical, repetitive, or copied from other sources?", + "weight_if_yes": 2, + "details_if_yes": "Content that isn't original or personal is suspicious." + }, + { + "id": "engagement_mismatch", + "prompt": "Is there a mismatch between the number of friends/followers and the engagement (likes/comments) on posts?", + "weight_if_yes": 1, + "details_if_yes": "Unusual ratios can be an indicator (e.g., many followers, but almost no likes)." + }, + { + "id": "poor_grammar_spelling", + "prompt": "Is the language in the profile's bio or posts consistently poor in grammar or spelling?", + "weight_if_yes": 1, + "details_if_yes": "Hastily created fake profiles often have noticeable language issues." + }, + { + "id": "about_section_sparse_or_inconsistent", + "prompt": "Is the 'About' or 'Bio' section sparse, inconsistent, or overly glamorous/fake?", + "weight_if_yes": 2, + "details_if_yes": "Incomplete or suspicious 'About' information is a red flag." + }, + { + "id": "pressure_or_strange_requests", + "prompt": "Has this profile sent messages pressuring you for information, money, or to click suspicious links?", + "weight_if_yes": 3, + "details_if_yes": "This is a strong indicator of a malicious account." 
+ } +] + +def guide_reverse_image_search(image_url=None): + """Opens browser tabs to guide the user through reverse image search.""" + print("\n--- Guiding Reverse Image Search ---") + print("You can use services like Google Images or TinEye to check if a profile picture is used elsewhere.") + if image_url: + google_url = f"https://images.google.com/searchbyimage?image_url={image_url}" + tineye_url = f"https://tineye.com/search?url={image_url}" + print(f"Attempting to open Google Images: {google_url}") + webbrowser.open(google_url) + print(f"Attempting to open TinEye: {tineye_url}") + webbrowser.open(tineye_url) + else: + print("If you have the image saved, you can upload it to these sites:") + print("Google Images: https://images.google.com/ (click the camera icon)") + webbrowser.open("https://images.google.com/") + print("TinEye: https://tineye.com/") + webbrowser.open("https://tineye.com/") + print("Look for whether the image is a common stock photo, belongs to a different person, or appears on many unrelated profiles.") + input("Press Enter to continue after performing your search...") + +def print_platform_specific_advice(platform): + """Prints platform-specific advice to the user.""" + if platform in PLATFORM_SPECIFIC_ADVICE: + print(f"\n--- Platform-Specific Advice for {platform.capitalize()} ---") + for advice in PLATFORM_SPECIFIC_ADVICE[platform]: + print(f"- {advice}") + +def analyze_profile_based_on_user_input(profile_url, platform): + """ + Guides the user through a checklist to assess if a social media profile is fake. + """ + print(f"\n--- Analyzing {platform.capitalize()} Profile (Manual Check) ---") + print(f"Please open the profile in your browser or app: {profile_url}") + print("You will be asked a series of questions based on your observations.") + webbrowser.open(profile_url) + + print_platform_specific_advice(platform) + + user_responses = {} + total_score = 0 + positive_indicators = [] + + perform_ris = input("\nDo you want guidance to perform a reverse image search on the profile picture? (yes/no): ").strip().lower() + if perform_ris == 'yes': + img_url_known = input("Do you have a direct URL for the profile image? (yes/no): ").strip().lower() + if img_url_known == 'yes': + actual_img_url = input("Please paste the direct image URL: ").strip() + guide_reverse_image_search(actual_img_url) + else: + guide_reverse_image_search() + + for indicator in FAKE_PROFILE_INDICATORS: + while True: + answer = input(f"{indicator['prompt']} (yes/no): ").strip().lower() + if answer in ['yes', 'no']: + user_responses[indicator['id']] = answer + if answer == 'yes': + total_score += indicator['weight_if_yes'] + positive_indicators.append(f"- {indicator['prompt']} ({indicator['details_if_yes']})") + break + else: + print("Invalid input. Please answer 'yes' or 'no'.") + + print("\n--- Fake Profile Analysis Results ---") + print(f"Profile URL: {profile_url}") + + if not positive_indicators: + print("Based on your answers, no common fake profile indicators were strongly identified.") + else: + print("The following indicators suggestive of a fake profile were noted:") + for pi in positive_indicators: + print(pi) + + print(f"\nOverall 'suspicion score': {total_score}") + if total_score <= 3: + print("Assessment: Low likelihood of being fake.") + elif total_score <= 6: + print("Assessment: Medium likelihood. Exercise caution.") + else: + print("Assessment: High likelihood. 
High caution advised.") + + print("\nDisclaimer: This analysis is based SOLELY on your manual observations.") + print("Always use your best judgment and consider reporting suspicious profiles to the platform.") + + return { + "profile_url": profile_url, + "platform": platform, + "score": total_score, + "positive_indicators": positive_indicators, + } + +if __name__ == '__main__': + print("Fake Profile Detector - Manual Checklist Tool") + test_platform = input("Enter the social media platform to simulate analyzing (e.g., instagram): ").strip().lower() + test_profile_url = input(f"Enter a {test_platform.capitalize()} profile URL to simulate analyzing: ").strip() + if test_profile_url and test_platform: + analyze_profile_based_on_user_input(test_profile_url, test_platform) diff --git a/social_media_analyzer/heuristics.py b/social_media_analyzer/heuristics.py new file mode 100644 index 0000000..13d0218 --- /dev/null +++ b/social_media_analyzer/heuristics.py @@ -0,0 +1,136 @@ +import re + +# --- Legitimate Domains --- +# This list helps the analyzer to recognize official domains and avoid flagging them. +# It's important to be precise here. +LEGITIMATE_DOMAINS = { + "facebook": ["facebook.com", "m.facebook.com", "fb.com", "messenger.com"], + "instagram": ["instagram.com", "instagr.am"], + "whatsapp": ["whatsapp.com", "wa.me"], + "tiktok": ["tiktok.com"], + "tinder": ["tinder.com", "gotinder.com"], + "snapchat": ["snapchat.com"], + "wechat": ["wechat.com"], + "general": ["google.com", "twitter.com", "linkedin.com"] # Other common safe domains +} + + +# --- Keyword Lists (case-insensitive matching will be applied) --- + +# Keywords/phrases indicating urgency or pressure +URGENCY_KEYWORDS = [ + "urgent", "immediate action required", "act now", "limited time", + "account suspended", "account will be closed", "final warning", + "security alert", "unusual activity detected", "important notification", + "don't delay", "expires soon", "offer ends today", "last chance", + "your subscription will be cancelled", "payment declined" +] + +# Keywords/phrases related to requests for sensitive information +SENSITIVE_INFO_KEYWORDS = [ + "verify your password", "confirm your password", "update your password", + "password", "username", "login details", "credentials", + "social security number", "ssn", + "bank account", "account number", "routing number", "credit card number", + "cvv", "pin number", "mother's maiden name", "security question", + "confirm your details", "update your information", "verify your account", + "provide your details", "personal information" +] + +# Keywords/phrases indicating too-good-to-be-true offers, prizes, etc. 
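# Editor's note, illustrative only (not introduced by this diff): the keyword
# lists in this module are matched case-insensitively against the message text,
# and each hit is meant to add its category's weight from HEURISTIC_WEIGHTS
# (defined further down), assuming the new analyzer keeps the per-hit
# accumulation used by the removed scam_detector.analyzer. Rough worked example
# for a hypothetical message:
#
#   "Congratulations, you have won! Verify your account at http://reward-claim.xyz"
#
#   "you have won"        -> TOO_GOOD_TO_BE_TRUE  (+2.0)
#   "verify your account" -> SENSITIVE_INFO       (+2.5)
#   the .xyz link         -> SUSPICIOUS_TLD       (+2.0, if flagged URLs are
#                            weighted as in the removed analyzer)
#
# giving a score of roughly 6.5, above the 5.0 default threshold at which the
# removed scam_main.py CLI printed its "High risk" warning.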
+TOO_GOOD_TO_BE_TRUE_KEYWORDS = [ + "you have won", "you've won", "congratulations you won", "winner", "prize", + "free gift", "claim your reward", "lottery", "sweepstakes", + "guaranteed", "risk-free", "earn money fast", "work from home easy", + "investment opportunity", "high return", "get rich quick", + "inheritance", "unclaimed funds", "nigerian prince", + "free followers", "pro account for free", "verified badge" # Social media specific +] + +# Generic greetings/salutations that can be suspicious in unsolicited contexts +GENERIC_GREETINGS = [ + "dear customer", "dear user", "dear valued customer", "dear account holder", + "dear friend", "hello sir/madam", "greetings" +] + +# Keywords often found in tech support scams +TECH_SUPPORT_SCAM_KEYWORDS = [ + "microsoft support", "windows support", "apple support", + "virus detected", "malware found", "your computer is infected", + "call immediately", "technician", "remote access", "ip address compromised" +] + +# Keywords related to payment requests or financial transactions +PAYMENT_KEYWORDS = [ + "payment", "invoice", "bill", "outstanding balance", "transfer funds", + "wire transfer", "gift card", "cryptocurrency", "bitcoin", "western union", "moneygram", + "urgent payment needed", "settle your account" +] + + +# --- Regular Expression Patterns --- + +# Basic URL detection +URL_PATTERN = re.compile( + r'(?:(?:https?|ftp):\/\/|www\.)' + r'(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*' + r'(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])', + re.IGNORECASE +) + +# Suspicious Top-Level Domains (TLDs) +SUSPICIOUS_TLDS = [ + '.xyz', '.top', '.loan', '.club', 'work', '.online', '.biz', '.info', + '.icu', '.gq', '.cf', '.tk', '.ml', + '.link', '.click', '.site', '.live', '.buzz', '.stream', '.download', +] + +# Pattern for detecting strings that look like cryptocurrency addresses +CRYPTO_ADDRESS_PATTERNS = { + "BTC": re.compile(r'\b(1[a-km-zA-HJ-NP-Z1-9]{25,34}|3[a-km-zA-HJ-NP-Z1-9]{25,34}|bc1[a-zA-HJ-NP-Z0-9]{25,90})\b'), + "ETH": re.compile(r'\b(0x[a-fA-F0-9]{40})\b'), +} + +# Pattern for phone numbers +PHONE_NUMBER_PATTERN = re.compile( + r'(\+?\d{1,3}[-.\s]?)?(\(?\d{2,4}\)?[-.\s]?)?(\d{3,4}[-.\s]?\d{3,4})' +) + +# Suspicious URL Patterns +# These patterns aim to catch URLs that impersonate legitimate domains. 
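# Editor's illustrative sketch (hypothetical helper, not introduced by this
# diff): a small function for checking which of the patterns defined just below
# a given URL trips. `re` is imported at the top of this module and the list
# name resolves at call time, so the helper is valid at module level.
def _matching_suspicious_patterns(url):
    """Return the SUSPICIOUS_URL_PATTERNS entries (if any) that `url` matches."""
    return [pattern for pattern in SUSPICIOUS_URL_PATTERNS
            if re.search(pattern, url, re.IGNORECASE)]
# Example with hypothetical URLs:
#   _matching_suspicious_patterns("http://facebook.com.account-check.xyz/login")
#     -> non-empty: the brand name appears as a subdomain of an unrelated .xyz
#        host, so the impersonation pattern (and the keyword-plus-TLD pattern) fire.
#   _matching_suspicious_patterns("https://www.facebook.com/login")
#     -> []  (the genuine domain is not caught by any of these patterns)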
+SUSPICIOUS_URL_PATTERNS = [
+    # Impersonation using subdomains or hyphens
+    r"https?://(?:[a-z0-9\-]+\.)*(?:facebook|fb|instagram|whatsapp|tiktok|tinder|snapchat|wechat)\.com\.[a-z0-9\-]+\.[a-z]+",
+    r"https?://(?:[a-z0-9\-]+\.)*(?:facebook|fb|instagram|whatsapp|tiktok|tinder|snapchat|wechat)-[a-z0-9\-]+\.[a-z]+",
+    # Common URL shorteners
+    r"https?://bit\.ly",
+    r"https?://goo\.gl",
+    r"https?://t\.co",
+    # IP Address URLs
+    r"https?://\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}",
+    # Generic suspicious keywords in the domain
+    r"https?://[^/]*(?:login|secure|account|update|verify|support|admin)[^/]*\.(?:biz|info|tk|ml|ga|cf|gq|xyz|club|top|loan|work|online|site)",
+    # Very long subdomains or many hyphens (restricted to the host part so that
+    # hyphenated paths or slugs do not trigger a false positive)
+    r"https?://(?:[a-z0-9\-]+\.){4,}",
+    r"https?://[^/\s]*-[^/\s]*-[^/\s]*-[^/\s]*",
+]
+
+
+# --- Scoring Weights ---
+HEURISTIC_WEIGHTS = {
+    "URGENCY": 1.5,
+    "SENSITIVE_INFO": 2.5,
+    "TOO_GOOD_TO_BE_TRUE": 2.0,
+    "GENERIC_GREETING": 0.5,
+    "TECH_SUPPORT": 2.0,
+    "PAYMENT_REQUEST": 1.5,
+    "SUSPICIOUS_URL_KEYWORD": 1.0,
+    "SUSPICIOUS_TLD": 2.0,
+    "CRYPTO_ADDRESS": 2.5,
+    "PHONE_NUMBER_UNSOLICITED": 1.0,
+    "SUSPICIOUS_URL_PATTERN": 3.0,  # High weight for matching a suspicious URL pattern
+}
+
+if __name__ == '__main__':
+    print("--- Heuristic Definitions ---")
+    # ... (rest of the test code can be added later)
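+    # Illustrative smoke test (a sketch added for convenience; adjust or remove as needed):
+    # print how many entries each keyword list and pattern list currently holds.
+    for _name, _keywords in [
+        ("URGENCY", URGENCY_KEYWORDS),
+        ("SENSITIVE_INFO", SENSITIVE_INFO_KEYWORDS),
+        ("TOO_GOOD_TO_BE_TRUE", TOO_GOOD_TO_BE_TRUE_KEYWORDS),
+        ("GENERIC_GREETINGS", GENERIC_GREETINGS),
+        ("TECH_SUPPORT", TECH_SUPPORT_SCAM_KEYWORDS),
+        ("PAYMENT", PAYMENT_KEYWORDS),
+    ]:
+        print(f"{_name}: {len(_keywords)} keywords")
+    print(f"Suspicious TLDs tracked: {len(SUSPICIOUS_TLDS)}")
+    print(f"Suspicious URL patterns tracked: {len(SUSPICIOUS_URL_PATTERNS)}")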
Please enter a number.") + +if __name__ == '__main__': + main() diff --git a/social_media_analyzer/scam_detector.py b/social_media_analyzer/scam_detector.py new file mode 100644 index 0000000..d23ba66 --- /dev/null +++ b/social_media_analyzer/scam_detector.py @@ -0,0 +1,151 @@ +import re +from urllib.parse import urlparse +from .heuristics import ( + URGENCY_KEYWORDS, + SENSITIVE_INFO_KEYWORDS, + TOO_GOOD_TO_BE_TRUE_KEYWORDS, + GENERIC_GREETINGS, + TECH_SUPPORT_SCAM_KEYWORDS, + PAYMENT_KEYWORDS, + URL_PATTERN, + SUSPICIOUS_TLDS, + CRYPTO_ADDRESS_PATTERNS, + PHONE_NUMBER_PATTERN, + HEURISTIC_WEIGHTS, + LEGITIMATE_DOMAINS, + SUSPICIOUS_URL_PATTERNS +) + +def get_legitimate_domains(platform=None): + """ + Returns a list of legitimate domains for a given platform, + including general safe domains. + """ + domains = set(LEGITIMATE_DOMAINS.get("general", [])) + if platform and platform in LEGITIMATE_DOMAINS: + domains.update(LEGITIMATE_DOMAINS[platform]) + return list(domains) + +def get_domain_from_url(url): + """Extracts the domain (e.g., 'example.com') from a URL.""" + if "://" in url: + domain = url.split("://")[1].split("/")[0].split("?")[0] + else: + domain = url.split("/")[0].split("?")[0] + return domain.lower() + +def is_url_suspicious(url, platform=None): + """ + Checks if a URL is suspicious based on various patterns and lists. + Returns a tuple: (bool_is_suspicious, reason_string) + """ + normalized_url = url.lower() + domain = get_domain_from_url(url) + legitimate_domains = get_legitimate_domains(platform) + + # 1. Check if the domain is in the legitimate list for the platform + if domain in legitimate_domains: + # Still check for impersonation patterns that might include the legit domain + for pattern in SUSPICIOUS_URL_PATTERNS: + if re.search(pattern, normalized_url, re.IGNORECASE): + if not domain.endswith(tuple(legitimate_domains)): + return True, f"URL impersonates a legitimate domain: {pattern}" + return False, "URL domain is on the legitimate list." + + # 2. Check against known suspicious patterns + for pattern in SUSPICIOUS_URL_PATTERNS: + if re.search(pattern, normalized_url, re.IGNORECASE): + return True, f"URL matches suspicious pattern: {pattern}" + + # 3. Check for suspicious TLDs + suspicious_tld_regex = re.compile(r"\.(" + "|".join(tld.lstrip('.') for tld in SUSPICIOUS_TLDS) + r")$", re.IGNORECASE) + if suspicious_tld_regex.search(domain): + return True, f"URL uses a potentially suspicious TLD." + + # 4. Check if a known legitimate service name is part of the domain, but it's not official + for service in LEGITIMATE_DOMAINS.keys(): + if service != "general" and service in domain: + return True, f"URL contains the name of a legitimate service ('{service}') but is not an official domain." + + return False, "URL does not match common suspicious patterns." + +def analyze_text_for_scams(text_content, platform=None): + """ + Analyzes a block of text content for various scam indicators. + """ + if not text_content: + return {"score": 0.0, "indicators_found": [], "urls_analyzed": []} + + text_lower = text_content.lower() + score = 0.0 + indicators_found = [] + urls_analyzed_details = [] + + # 1. 
+    keyword_checks = {
+        "URGENCY": URGENCY_KEYWORDS,
+        "SENSITIVE_INFO": SENSITIVE_INFO_KEYWORDS,
+        "TOO_GOOD_TO_BE_TRUE": TOO_GOOD_TO_BE_TRUE_KEYWORDS,
+        "GENERIC_GREETING": GENERIC_GREETINGS,
+        "TECH_SUPPORT": TECH_SUPPORT_SCAM_KEYWORDS,
+        "PAYMENT_REQUEST": PAYMENT_KEYWORDS,
+    }
+
+    for category, keywords in keyword_checks.items():
+        for keyword in keywords:
+            if keyword in text_lower:
+                message = f"Presence of '{category.replace('_', ' ').title()}' keyword: '{keyword}'"
+                if message not in indicators_found:
+                    indicators_found.append(message)
+                score += HEURISTIC_WEIGHTS.get(category, 1.0)
+
+    # 2. Regex-based checks
+    found_urls = URL_PATTERN.findall(text_content)
+    for url_str in found_urls:
+        is_susp, reason = is_url_suspicious(url_str, platform)
+        url_analysis = {"url": url_str, "is_suspicious": is_susp, "reason": reason}
+        if is_susp:
+            score += HEURISTIC_WEIGHTS.get("SUSPICIOUS_URL_PATTERN", 3.0)
+            indicators_found.append(f"Suspicious URL found: {url_str} (Reason: {reason})")
+        urls_analyzed_details.append(url_analysis)
+
+    # 3. Crypto Addresses
+    for crypto_name, pattern in CRYPTO_ADDRESS_PATTERNS.items():
+        if pattern.search(text_content):
+            message = f"Potential {crypto_name} cryptocurrency address found."
+            if message not in indicators_found:
+                indicators_found.append(message)
+            score += HEURISTIC_WEIGHTS.get("CRYPTO_ADDRESS", 2.5)
+
+    # 4. Phone Numbers
+    if PHONE_NUMBER_PATTERN.search(text_content):
+        message = "Phone number detected in text."
+        if message not in indicators_found:
+            indicators_found.append(message)
+        score += HEURISTIC_WEIGHTS.get("PHONE_NUMBER_UNSOLICITED", 1.0)
+
+    return {
+        "score": round(score, 2),
+        "indicators_found": indicators_found,
+        "urls_analyzed": urls_analyzed_details
+    }
+
+if __name__ == '__main__':
+    # Example Usage
+    test_message = "URGENT: Your Instagram account has unusual activity. Please verify your account now by clicking http://instagram.security-update.com/login to avoid suspension."
+    analysis_result = analyze_text_for_scams(test_message, platform="instagram")
+    print("--- Analyzing Instagram Scam Message ---")
+    print(f"Message: {test_message}")
+    print(f"Score: {analysis_result['score']}")
+    print("Indicators:")
+    for indicator in analysis_result['indicators_found']:
+        print(f" - {indicator}")
+
+    test_message_whatsapp = "Hey, check out this link: http://wa.me/1234567890. Also, please send money to my bitcoin wallet 1A1zP1eP5QGefi2DMPTfTL5SLmv7DivfNa"
+    analysis_result_whatsapp = analyze_text_for_scams(test_message_whatsapp, platform="whatsapp")
+    print("\n--- Analyzing WhatsApp Message ---")
+    print(f"Message: {test_message_whatsapp}")
+    print(f"Score: {analysis_result_whatsapp['score']}")
+    print("Indicators:")
+    for indicator in analysis_result_whatsapp['indicators_found']:
+        print(f" - {indicator}")
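+
+    # Additional illustrative example (hypothetical URL and a fictional 555 phone number):
+    # a tech-support style message that should trigger the tech-support keyword,
+    # suspicious-URL, and phone-number heuristics at once.
+    test_message_support = (
+        "Virus detected on your device! Contact Microsoft Support now at 800 555 0199 "
+        "or visit http://windows-support-center.xyz to restore remote access."
+    )
+    analysis_result_support = analyze_text_for_scams(test_message_support)
+    print("\n--- Analyzing Tech Support Scam Message ---")
+    print(f"Message: {test_message_support}")
+    print(f"Score: {analysis_result_support['score']}")
+    print("Indicators:")
+    for indicator in analysis_result_support['indicators_found']:
+        print(f" - {indicator}")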