Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions lib/mixpanel-ruby.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,6 @@
require 'mixpanel-ruby/flags/flags_provider.rb'
require 'mixpanel-ruby/flags/local_flags_provider.rb'
require 'mixpanel-ruby/flags/remote_flags_provider.rb'
require 'mixpanel-ruby/ai_bot_classifier'
require 'mixpanel-ruby/ai_bot_properties'
require 'mixpanel-ruby/middleware/ai_bot_classifier'
145 changes: 145 additions & 0 deletions lib/mixpanel-ruby/ai_bot_classifier.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
# lib/mixpanel-ruby/ai_bot_classifier.rb

module Mixpanel
  # Classifies HTTP user-agent strings against a built-in table of known
  # AI crawler / retrieval bots.
  #
  # All classification results are fresh, mutable hashes so that callers may
  # annotate them (for example with request metadata) without affecting
  # later calls.
  module AiBotClassifier
    # Built-in bot table. Each entry has :pattern (Regexp tested against the
    # user-agent), :name, :provider, :category and :description.
    #
    # The array, every entry hash, and every value are frozen so the shared
    # table cannot be altered at runtime -- including through the strings
    # handed out by .bot_database and .classify.
    BOT_DATABASE = [
      {
        pattern: /GPTBot\//i,
        name: 'GPTBot',
        provider: 'OpenAI',
        category: 'indexing',
        description: 'OpenAI web crawler for model training data',
      },
      {
        pattern: /ChatGPT-User\//i,
        name: 'ChatGPT-User',
        provider: 'OpenAI',
        category: 'retrieval',
        description: 'ChatGPT real-time retrieval for user queries (RAG)',
      },
      {
        pattern: /OAI-SearchBot\//i,
        name: 'OAI-SearchBot',
        provider: 'OpenAI',
        category: 'indexing',
        description: 'OpenAI search indexing crawler',
      },
      {
        pattern: /ClaudeBot\//i,
        name: 'ClaudeBot',
        provider: 'Anthropic',
        category: 'indexing',
        description: 'Anthropic web crawler for model training',
      },
      {
        pattern: /Claude-User\//i,
        name: 'Claude-User',
        provider: 'Anthropic',
        category: 'retrieval',
        description: 'Claude real-time retrieval for user queries',
      },
      {
        pattern: /Google-Extended\//i,
        name: 'Google-Extended',
        provider: 'Google',
        category: 'indexing',
        description: 'Google AI training data crawler',
      },
      {
        pattern: /PerplexityBot\//i,
        name: 'PerplexityBot',
        provider: 'Perplexity',
        category: 'retrieval',
        description: 'Perplexity AI search crawler',
      },
      {
        pattern: /Bytespider\//i,
        name: 'Bytespider',
        provider: 'ByteDance',
        category: 'indexing',
        description: 'ByteDance/TikTok AI crawler',
      },
      {
        pattern: /CCBot\//i,
        name: 'CCBot',
        provider: 'Common Crawl',
        category: 'indexing',
        description: 'Common Crawl bot',
      },
      {
        pattern: /Applebot-Extended\//i,
        name: 'Applebot-Extended',
        provider: 'Apple',
        category: 'indexing',
        description: 'Apple AI/Siri training data crawler',
      },
      {
        pattern: /Meta-ExternalAgent\//i,
        name: 'Meta-ExternalAgent',
        provider: 'Meta',
        category: 'indexing',
        description: 'Meta/Facebook AI training data crawler',
      },
      {
        pattern: /cohere-ai\//i,
        name: 'cohere-ai',
        provider: 'Cohere',
        category: 'indexing',
        description: 'Cohere AI training data crawler',
      },
    ].each { |bot| bot.each_value(&:freeze).freeze }.freeze

    # Classify a user-agent string against the AI bot database.
    #
    # @param user_agent [String, nil] The user-agent string to classify
    # @return [Hash] Classification result with :is_ai_bot and, when a bot
    #   matched, :bot_name, :provider and :category
    def self.classify(user_agent)
      match_bot(BOT_DATABASE, user_agent)
    end

    # Return a copy of the bot database for inspection.
    #
    # The :pattern key is omitted; only the descriptive fields are returned.
    #
    # @return [Array<Hash>] Array of bot entries with :name, :provider,
    #   :category and :description
    def self.bot_database
      BOT_DATABASE.map { |bot| bot.slice(:name, :provider, :category, :description) }
    end

    # Create a classifier with optional additional bot patterns.
    #
    # @param additional_bots [Array<Hash>, nil] Additional bot entries
    #   (checked before the built-in ones). Each entry is read through its
    #   :pattern, :name, :provider and :category keys. nil is treated as an
    #   empty list.
    # @return [Proc] A lambda that accepts a user-agent string and returns
    #   the same kind of hash as .classify
    def self.create_classifier(additional_bots: [])
      # Array#+ builds a new array, so later changes to the caller's list do
      # not affect the returned classifier.
      combined = (additional_bots || []) + BOT_DATABASE

      ->(user_agent) { match_bot(combined, user_agent) }
    end

    # Shared matching routine behind .classify and .create_classifier.
    # Entries are tested in order and the first matching pattern wins.
    #
    # @param bots [Array<Hash>] Bot entries to test, in priority order
    # @param user_agent [String, nil] The user-agent string to classify
    # @return [Hash] A newly allocated classification hash
    def self.match_bot(bots, user_agent)
      return { is_ai_bot: false } if user_agent.nil? || user_agent.empty?

      bot = bots.find { |entry| entry[:pattern].match?(user_agent) }
      return { is_ai_bot: false } unless bot

      {
        is_ai_bot: true,
        bot_name: bot[:name],
        provider: bot[:provider],
        category: bot[:category],
      }
    end
    private_class_method :match_bot
  end
end
51 changes: 51 additions & 0 deletions lib/mixpanel-ruby/ai_bot_properties.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# lib/mixpanel-ruby/ai_bot_properties.rb

require_relative 'ai_bot_classifier'

module Mixpanel
  # Mixin for Tracker that adds AI bot classification properties to the
  # properties passed through track().
  #
  # The classification comes from one of two places, tried in this order:
  #   1. The '$user_agent' entry of the event properties, classified on the spot
  #   2. Thread.current[:mixpanel_bot_classification] (set by the Rack middleware)
  #
  # When neither is available the properties are forwarded unchanged.
  #
  # Usage:
  #   tracker = Mixpanel::Tracker.new(token)
  #   tracker.extend(Mixpanel::AiBotProperties)
  #   tracker.track(distinct_id, event, {'$user_agent' => request.user_agent})
  #
  module AiBotProperties
    # Enrich the event properties with bot classification, then delegate to
    # the next track() implementation in the ancestor chain.
    #
    # The caller's properties hash is not modified; a merged copy is passed on.
    def track(distinct_id, event, properties = {}, ip = nil)
      user_agent = properties['$user_agent']

      # An explicit user agent on the event takes precedence over whatever
      # the middleware stored for the current request.
      classification =
        if user_agent
          AiBotClassifier.classify(user_agent)
        else
          Thread.current[:mixpanel_bot_classification]
        end

      enriched = properties
      enriched = properties.merge(classification_to_properties(classification)) if classification

      super(distinct_id, event, enriched, ip)
    end

    private

    # Translate a classification hash into Mixpanel event properties.
    # '$is_ai_bot' is always present; the name/provider/category entries are
    # added only for a positive classification.
    def classification_to_properties(classification)
      is_bot = classification[:is_ai_bot]
      flag_only = { '$is_ai_bot' => is_bot }
      return flag_only unless is_bot

      flag_only.merge(
        '$ai_bot_name' => classification[:bot_name],
        '$ai_bot_provider' => classification[:provider],
        '$ai_bot_category' => classification[:category]
      )
    end
  end
end
64 changes: 64 additions & 0 deletions lib/mixpanel-ruby/middleware/ai_bot_classifier.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
# lib/mixpanel-ruby/middleware/ai_bot_classifier.rb

require_relative '../ai_bot_classifier'

module Mixpanel
  module Middleware
    # Rack middleware that classifies incoming HTTP requests for AI bot
    # detection and stores the result for downstream Mixpanel tracking.
    #
    # The stored classification is the classifier's result plus two extra
    # keys, :ip and :user_agent. It is written to:
    #   - env['mixpanel.bot_classification']            (Rack convention)
    #   - Thread.current[:mixpanel_bot_classification]  (for Tracker access)
    #
    # The Thread.current entry is reset to nil once the downstream app
    # returns or raises.
    #
    # Usage:
    #   # In config.ru:
    #   use Mixpanel::Middleware::AiBotClassifier
    #
    #   # In Rails application.rb:
    #   config.middleware.use Mixpanel::Middleware::AiBotClassifier
    #
    class AiBotClassifier
      # @param app [#call] The next Rack application in the stack
      # @param options [Hash] Optional settings
      # @option options [Array<Hash>] :additional_bots Extra bot entries that
      #   are passed to Mixpanel::AiBotClassifier.create_classifier
      def initialize(app, options = {})
        @app = app
        @classifier = if options[:additional_bots]
                        Mixpanel::AiBotClassifier.create_classifier(
                          additional_bots: options[:additional_bots]
                        )
                      else
                        Mixpanel::AiBotClassifier.method(:classify)
                      end
      end

      # Classify the request, publish the result, and invoke the next app.
      #
      # @param env [Hash] The Rack environment
      # @return [Object] Whatever the downstream app returns
      def call(env)
        user_agent = env['HTTP_USER_AGENT']

        # merge builds a new hash, so the classifier's own return value is
        # never written to.
        classification = @classifier.call(user_agent).merge(
          ip: extract_ip(env),
          user_agent: user_agent
        )

        env['mixpanel.bot_classification'] = classification
        Thread.current[:mixpanel_bot_classification] = classification

        begin
          @app.call(env)
        ensure
          # Always clear, even on exceptions, so the value does not carry
          # over to the next request handled on this thread.
          Thread.current[:mixpanel_bot_classification] = nil
        end
      end

      private

      # Determine the client IP for the request.
      #
      # Uses the first non-blank entry of the X-Forwarded-For header and
      # falls back to REMOTE_ADDR when the header is absent or contains no
      # usable entry (e.g. "", " " or ","). String#split drops trailing
      # empty fields, so a header of "," splits to an empty array; taking
      # `.first.strip` on that would raise NoMethodError.
      #
      # NOTE(review): X-Forwarded-For is supplied by the client unless a
      # trusted proxy rewrites it -- treat the value as informational.
      #
      # @param env [Hash] The Rack environment
      # @return [String, nil] The IP string, or nil when neither source is set
      def extract_ip(env)
        forwarded = env['HTTP_X_FORWARDED_FOR'].to_s
        client = forwarded.split(',').map(&:strip).find { |entry| !entry.empty? }
        client || env['REMOTE_ADDR']
      end
    end
  end
end
1 change: 1 addition & 0 deletions mixpanel-ruby.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,5 @@ spec = Gem::Specification.new do |spec|
spec.add_development_dependency 'ruby-lsp-rspec'
spec.add_development_dependency 'simplecov'
spec.add_development_dependency 'simplecov-cobertura'
spec.add_development_dependency 'rack'
end
Loading