# frozen_string_literal: true

# lib/mixpanel-ruby/ai_bot_classifier.rb
#
# Patch note: lib/mixpanel-ruby.rb additionally gains
#   require 'mixpanel-ruby/ai_bot_classifier'
#   require 'mixpanel-ruby/ai_bot_properties'
#   require 'mixpanel-ruby/middleware/ai_bot_classifier'

module Mixpanel
  # Classifies HTTP user-agent strings against a table of known AI crawlers
  # and retrieval agents.
  module AiBotClassifier
    # Known AI bots. Entries are tried in order and the first matching
    # :pattern wins. Every pattern requires the product token to be followed
    # by "/" and is case-insensitive. The array and each entry are frozen so
    # callers cannot alter the shared table.
    BOT_DATABASE = [
      {
        pattern: /GPTBot\//i,
        name: 'GPTBot',
        provider: 'OpenAI',
        category: 'indexing',
        description: 'OpenAI web crawler for model training data',
      },
      {
        pattern: /ChatGPT-User\//i,
        name: 'ChatGPT-User',
        provider: 'OpenAI',
        category: 'retrieval',
        description: 'ChatGPT real-time retrieval for user queries (RAG)',
      },
      {
        pattern: /OAI-SearchBot\//i,
        name: 'OAI-SearchBot',
        provider: 'OpenAI',
        category: 'indexing',
        description: 'OpenAI search indexing crawler',
      },
      {
        pattern: /ClaudeBot\//i,
        name: 'ClaudeBot',
        provider: 'Anthropic',
        category: 'indexing',
        description: 'Anthropic web crawler for model training',
      },
      {
        pattern: /Claude-User\//i,
        name: 'Claude-User',
        provider: 'Anthropic',
        category: 'retrieval',
        description: 'Claude real-time retrieval for user queries',
      },
      {
        pattern: /Google-Extended\//i,
        name: 'Google-Extended',
        provider: 'Google',
        category: 'indexing',
        description: 'Google AI training data crawler',
      },
      {
        pattern: /PerplexityBot\//i,
        name: 'PerplexityBot',
        provider: 'Perplexity',
        category: 'retrieval',
        description: 'Perplexity AI search crawler',
      },
      {
        pattern: /Bytespider\//i,
        name: 'Bytespider',
        provider: 'ByteDance',
        category: 'indexing',
        description: 'ByteDance/TikTok AI crawler',
      },
      {
        pattern: /CCBot\//i,
        name: 'CCBot',
        provider: 'Common Crawl',
        category: 'indexing',
        description: 'Common Crawl bot',
      },
      {
        pattern: /Applebot-Extended\//i,
        name: 'Applebot-Extended',
        provider: 'Apple',
        category: 'indexing',
        description: 'Apple AI/Siri training data crawler',
      },
      {
        pattern: /Meta-ExternalAgent\//i,
        name: 'Meta-ExternalAgent',
        provider: 'Meta',
        category: 'indexing',
        description: 'Meta/Facebook AI training data crawler',
      },
      {
        pattern: /cohere-ai\//i,
        name: 'cohere-ai',
        provider: 'Cohere',
        category: 'indexing',
        description: 'Cohere AI training data crawler',
      },
    ].each(&:freeze).freeze

    # Classify a user-agent string against the built-in AI bot database.
    #
    # @param user_agent [String, nil, #to_s] The user-agent to classify;
    #   nil, empty and non-String values are treated as "not a bot" input
    #   rather than raising
    # @return [Hash] A fresh, mutable hash: { is_ai_bot: false } when nothing
    #   matches, otherwise :is_ai_bot (true), :bot_name, :provider, :category
    def self.classify(user_agent)
      match_against(BOT_DATABASE, user_agent)
    end

    # Return a copy of the bot database for inspection (patterns omitted).
    #
    # @return [Array<Hash>] One new hash per bot with :name, :provider,
    #   :category and :description
    def self.bot_database
      BOT_DATABASE.map { |bot| bot.slice(:name, :provider, :category, :description) }
    end

    # Create a classifier with optional additional bot patterns.
    #
    # @param additional_bots [Array<Hash>, nil] Extra entries shaped like
    #   BOT_DATABASE rows (:pattern, :name, :provider, :category); they are
    #   checked before the built-in ones. nil is treated as "none".
    # @return [Proc] A lambda taking a user-agent and returning the same
    #   result shape as {.classify}
    def self.create_classifier(additional_bots: [])
      # Snapshot the list now so later changes to the caller's array do not
      # affect an already-built classifier.
      combined = (additional_bots || []) + BOT_DATABASE

      ->(user_agent) { match_against(combined, user_agent) }
    end

    # Shared matching routine behind {.classify} and {.create_classifier}.
    #
    # @param bots [Array<Hash>] Candidate entries, in priority order
    # @param user_agent [#to_s, nil] Raw user-agent value
    # @return [Hash] Classification result (always a new hash)
    def self.match_against(bots, user_agent)
      ua = user_agent.to_s
      return { is_ai_bot: false } if ua.empty?

      # Replace invalid byte sequences first: regexp matching can raise
      # ArgumentError on strings whose encoding is broken.
      ua = ua.scrub unless ua.valid_encoding?

      bot = bots.find { |entry| entry[:pattern].match?(ua) }
      return { is_ai_bot: false } unless bot

      {
        is_ai_bot: true,
        bot_name: bot[:name],
        provider: bot[:provider],
        category: bot[:category],
      }
    end
    private_class_method :match_against
  end
end

# lib/mixpanel-ruby/ai_bot_properties.rb
# File-level dependency (unchanged): require_relative 'ai_bot_classifier'

module Mixpanel
  # Module mixin for Tracker that enriches track() calls with AI bot
  # classification properties.
  #
  # Uses two sources (in priority order):
  #   1. '$user_agent' property in the event properties (direct classification)
  #   2. Thread.current[:mixpanel_bot_classification] (from Rack middleware)
  #
  # Usage:
  #   tracker = Mixpanel::Tracker.new(token)
  #   tracker.extend(Mixpanel::AiBotProperties)
  #   tracker.track(distinct_id, event, {'$user_agent' => request.user_agent})
  #
  module AiBotProperties
    # Adds '$is_ai_bot' (and, for bots, '$ai_bot_name', '$ai_bot_provider',
    # '$ai_bot_category') to the event properties, then delegates to the
    # underlying track implementation via super. The caller's properties
    # hash is not modified. When neither source yields a classification the
    # properties are passed through untouched.
    #
    # @param distinct_id [Object] Forwarded unchanged to super
    # @param event [Object] Forwarded unchanged to super
    # @param properties [Hash] Event properties; '$user_agent' (String key)
    #   triggers direct classification
    # @param ip [Object, nil] Forwarded unchanged to super
    # @return [Object] Whatever super returns
    def track(distinct_id, event, properties = {}, ip = nil)
      user_agent = properties['$user_agent']

      classification =
        if user_agent
          # Priority 1: classify from the $user_agent property
          AiBotClassifier.classify(user_agent)
        else
          # Priority 2: thread-local set by the Rack middleware (may be nil)
          Thread.current[:mixpanel_bot_classification]
        end

      if classification
        properties = properties.merge(classification_to_properties(classification))
      end

      super(distinct_id, event, properties, ip)
    end

    private

    # Translate a classifier result hash into Mixpanel event properties.
    # Only :is_ai_bot, :bot_name, :provider and :category are read; any other
    # keys in the classification are ignored.
    def classification_to_properties(classification)
      props = { '$is_ai_bot' => classification[:is_ai_bot] }
      if classification[:is_ai_bot]
        props['$ai_bot_name'] = classification[:bot_name]
        props['$ai_bot_provider'] = classification[:provider]
        props['$ai_bot_category'] = classification[:category]
      end
      props
    end
  end
end
# frozen_string_literal: true

# lib/mixpanel-ruby/middleware/ai_bot_classifier.rb
# File-level dependency (unchanged): require_relative '../ai_bot_classifier'
# Patch note: mixpanel-ruby.gemspec additionally gains
#   spec.add_development_dependency 'rack'

module Mixpanel
  module Middleware
    # Rack middleware that classifies incoming HTTP requests for AI bot
    # detection and stores the result for downstream Mixpanel tracking.
    #
    # Classification is stored in:
    #   - env['mixpanel.bot_classification'] (Rack convention)
    #   - Thread.current[:mixpanel_bot_classification] (for Tracker access)
    #
    # Besides the classifier's own keys, the stored hash carries :ip and
    # :user_agent for the request.
    #
    # Usage:
    #   # In config.ru:
    #   use Mixpanel::Middleware::AiBotClassifier
    #
    #   # In Rails application.rb:
    #   config.middleware.use Mixpanel::Middleware::AiBotClassifier
    #
    class AiBotClassifier
      # @param app [#call] The next Rack application in the stack
      # @param options [Hash] Optional settings; :additional_bots is an array
      #   of extra bot entries that take priority over the built-in database
      def initialize(app, options = {})
        @app = app
        extra_bots = options[:additional_bots]
        @classifier =
          if extra_bots
            Mixpanel::AiBotClassifier.create_classifier(additional_bots: extra_bots)
          else
            Mixpanel::AiBotClassifier.method(:classify)
          end
      end

      # Classify the request, expose the result, and pass the request on.
      #
      # @param env [Hash] Rack environment
      # @return [Object] The inner app's response, unmodified
      def call(env)
        user_agent = env['HTTP_USER_AGENT']

        # merge builds a new hash, so the classifier's return value is never
        # mutated (it may be frozen or shared).
        classification = @classifier.call(user_agent).merge(
          ip: extract_ip(env),
          user_agent: user_agent
        )

        env['mixpanel.bot_classification'] = classification
        # Thread#[] storage is fiber-local; it is visible to code that runs
        # in the same fiber as this call.
        Thread.current[:mixpanel_bot_classification] = classification

        begin
          @app.call(env)
        ensure
          # Always clear, even when the app raises, so the value cannot leak
          # into the next request served by this thread.
          Thread.current[:mixpanel_bot_classification] = nil
        end
      end

      private

      # Pick the client IP: the first non-blank entry of X-Forwarded-For,
      # falling back to REMOTE_ADDR when the header is missing, empty, or
      # contains only separators/whitespace (e.g. "," or "  ").
      #
      # NOTE(review): X-Forwarded-For is supplied by the client unless a
      # trusted proxy overwrites it, so this value is informational only and
      # must not be used for access decisions.
      def extract_ip(env)
        forwarded = env['HTTP_X_FORWARDED_FOR'].to_s
        client = forwarded.split(',').map(&:strip).find { |hop| !hop.empty? }
        client || env['REMOTE_ADDR']
      end
    end
  end
end
# (continues the 'classifies OAI-SearchBot agent' example of the
#  Mixpanel::AiBotClassifier '.classify' group, opened on the previous line)
      expect(result[:category]).to eq('indexing')
    end

    # === Anthropic Bots ===

    it 'classifies ClaudeBot agent' do
      result = described_class.classify(
        'Mozilla/5.0 (compatible; ClaudeBot/1.0; +claudebot@anthropic.com)'
      )
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('ClaudeBot')
      expect(result[:provider]).to eq('Anthropic')
      expect(result[:category]).to eq('indexing')
    end

    it 'classifies Claude-User agent' do
      result = described_class.classify('Mozilla/5.0 (compatible; Claude-User/1.0)')
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('Claude-User')
      expect(result[:provider]).to eq('Anthropic')
      expect(result[:category]).to eq('retrieval')
    end

    # === Google ===

    it 'classifies Google-Extended agent' do
      result = described_class.classify('Mozilla/5.0 (compatible; Google-Extended/1.0)')
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('Google-Extended')
      expect(result[:provider]).to eq('Google')
      expect(result[:category]).to eq('indexing')
    end

    # === Perplexity ===

    it 'classifies PerplexityBot agent' do
      result = described_class.classify('Mozilla/5.0 (compatible; PerplexityBot/1.0)')
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('PerplexityBot')
      expect(result[:provider]).to eq('Perplexity')
      expect(result[:category]).to eq('retrieval')
    end

    # === ByteDance ===

    it 'classifies Bytespider agent' do
      result = described_class.classify('Mozilla/5.0 (compatible; Bytespider/1.0)')
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('Bytespider')
      expect(result[:provider]).to eq('ByteDance')
    end

    # === Common Crawl ===

    it 'classifies CCBot agent' do
      result = described_class.classify('CCBot/2.0 (https://commoncrawl.org/faq/)')
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('CCBot')
      expect(result[:provider]).to eq('Common Crawl')
    end

    # === Apple ===

    it 'classifies Applebot-Extended agent' do
      result = described_class.classify(
        'Mozilla/5.0 (Macintosh; Intel Mac OS X) ' \
        'AppleWebKit/605.1.15 (KHTML, like Gecko) Applebot-Extended/0.1'
      )
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('Applebot-Extended')
      expect(result[:provider]).to eq('Apple')
    end

    # === Meta ===

    it 'classifies Meta-ExternalAgent' do
      result = described_class.classify('Mozilla/5.0 (compatible; Meta-ExternalAgent/1.0)')
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('Meta-ExternalAgent')
      expect(result[:provider]).to eq('Meta')
    end

    # === Cohere ===

    it 'classifies cohere-ai agent' do
      result = described_class.classify('cohere-ai/1.0 (https://cohere.com)')
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('cohere-ai')
      expect(result[:provider]).to eq('Cohere')
      expect(result[:category]).to eq('indexing')
    end

    # === NEGATIVE CASES ===
    # Ordinary browsers, classic search crawlers and CLI tools must not be
    # reported as AI bots.

    it 'does not classify regular Chrome as AI bot' do
      result = described_class.classify(
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 ' \
        '(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
      )
      expect(result[:is_ai_bot]).to be false
      expect(result[:bot_name]).to be_nil
    end

    it 'does not classify regular Googlebot as AI bot' do
      result = described_class.classify(
        'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'
      )
      expect(result[:is_ai_bot]).to be false
    end

    it 'does not classify regular Bingbot as AI bot' do
      result = described_class.classify(
        'Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)'
      )
      expect(result[:is_ai_bot]).to be false
    end

    it 'does not classify curl as AI bot' do
      result = described_class.classify('curl/7.64.1')
      expect(result[:is_ai_bot]).to be false
    end

    it 'handles empty string' do
      result = described_class.classify('')
      expect(result[:is_ai_bot]).to be false
    end

    it 'handles nil' do
      result = described_class.classify(nil)
      expect(result[:is_ai_bot]).to be false
    end

    # === CASE SENSITIVITY ===

    it 'matches case-insensitively' do
      result = described_class.classify('mozilla/5.0 (compatible; gptbot/1.2)')
      expect(result[:is_ai_bot]).to be true
      # The canonical name from the database is returned, not the casing
      # found in the user agent.
      expect(result[:bot_name]).to eq('GPTBot')
    end

    # === RETURN SHAPE ===

    it 'returns all expected fields for a match' do
      result = described_class.classify('GPTBot/1.2')
      expect(result).to have_key(:is_ai_bot)
      expect(result).to have_key(:bot_name)
      expect(result).to have_key(:provider)
      expect(result).to have_key(:category)
      expect(%w[indexing retrieval agent]).to include(result[:category])
    end

    it 'returns only is_ai_bot for non-matches' do
      result = described_class.classify('Chrome/120')
      expect(result.keys).to eq([:is_ai_bot])
      expect(result[:is_ai_bot]).to be false
    end
  end

  describe '.bot_database' do
    it 'returns an array of bot entries' do
      db = described_class.bot_database
      expect(db).to be_an(Array)
      expect(db.length).to be > 0
    end

    it 'has required fields on each entry' do
      described_class.bot_database.each do |entry|
        expect(entry).to have_key(:name)
        expect(entry).to have_key(:provider)
        expect(entry).to have_key(:category)
      end
    end
  end

  describe '.create_classifier' do
    it 'allows adding custom bot patterns' do
      classifier = described_class.create_classifier(
        additional_bots: [
          {
            pattern: /MyCustomBot\//i,
            name: 'MyCustomBot',
            provider: 'CustomCorp',
            category: 'indexing',
          }
        ]
      )
      result = classifier.call('Mozilla/5.0 (compatible; MyCustomBot/1.0)')
      expect(result[:is_ai_bot]).to be true
      expect(result[:bot_name]).to eq('MyCustomBot')
    end

    # A custom entry whose pattern overlaps a built-in one must win.
    it 'checks custom bots before built-in bots' do
      classifier = described_class.create_classifier(
        additional_bots: [
          {
            pattern: /GPTBot\//i,
            name: 'GPTBot-Custom',
            provider: 'CustomProvider',
            category: 'retrieval',
          }
        ]
      )
      result = classifier.call('GPTBot/1.2')
      expect(result[:bot_name]).to eq('GPTBot-Custom')
    end
  end
end

# spec/mixpanel-ruby/ai_bot_properties_spec.rb
require 'spec_helper'
require 'mixpanel-ruby'
require 'mixpanel-ruby/ai_bot_properties'
require 'mixpanel-ruby/ai_bot_classifier'

describe Mixpanel::AiBotProperties do
  # Pins Time.now and builds a tracker whose block records every emitted
  # message in @log as [type, parsed JSON]; the tracker is then extended
  # with the mixin under test.
  before(:each) do
    @time_now = Time.parse('Jun 6 1972, 16:23:04')
    allow(Time).to receive(:now).and_return(@time_now)

    @log = []
    @tracker = Mixpanel::Tracker.new('TEST TOKEN') do |type, message|
      @log << [type, JSON.load(message)]
    end
    @tracker.extend(Mixpanel::AiBotProperties)
  end

  # Examples below set the thread-local directly; reset it so state cannot
  # leak between examples.
  after(:each) do
    Thread.current[:mixpanel_bot_classification] = nil
  end

  describe '#track with $user_agent property' do
    it 'enriches events when $user_agent identifies an AI bot' do
      @tracker.track('user123', 'page_view', {
        '$user_agent' => 'Mozilla/5.0 (compatible; GPTBot/1.2; +https://openai.com/gptbot)',
      })

      expect(@log.length).to eq(1)
      # NOTE(review): `type` is assigned but not used in this example.
      type, message = @log[0]
      props = message['data']['properties']

      expect(props['$is_ai_bot']).to be true
      expect(props['$ai_bot_name']).to eq('GPTBot')
      expect(props['$ai_bot_provider']).to eq('OpenAI')
      expect(props['$ai_bot_category']).to eq('indexing')
    end

    it 'sets $is_ai_bot false for non-AI user agents' do
      @tracker.track('user123', 'page_view', {
        '$user_agent' => 'Mozilla/5.0 Chrome/120.0.0.0',
      })

      props = @log[0][1]['data']['properties']
      expect(props['$is_ai_bot']).to be false
      expect(props['$ai_bot_name']).to be_nil
    end

    it 'does not add classification when $user_agent is absent' do
      @tracker.track('user123', 'page_view', { 'page' => '/home' })

      props = @log[0][1]['data']['properties']
      expect(props).not_to have_key('$is_ai_bot')
    end

    it 'preserves existing properties' do
      @tracker.track('user123', 'page_view', {
        '$user_agent' => 'GPTBot/1.2',
        'page_url' => '/products',
        'custom_prop' => 'value',
      })

      props = @log[0][1]['data']['properties']
      expect(props['page_url']).to eq('/products')
      expect(props['custom_prop']).to eq('value')
      expect(props['$is_ai_bot']).to be true
    end

    it 'preserves SDK default properties' do
      @tracker.track('user123', 'page_view', {
        '$user_agent' => 'GPTBot/1.2',
      })

      props = @log[0][1]['data']['properties']
      expect(props['token']).to eq('TEST TOKEN')
      expect(props['distinct_id']).to eq('user123')
      expect(props['mp_lib']).to eq('ruby')
      expect(props['$lib_version']).to eq(Mixpanel::VERSION)
    end

    it 'returns true on success (matches existing track behavior)' do
      result = @tracker.track('user123', 'page_view', {
        '$user_agent' => 'GPTBot/1.2',
      })
      expect(result).to be true
    end

    it 'passes through ip parameter' do
      @tracker.track('user123', 'page_view', {
        '$user_agent' => 'GPTBot/1.2',
      }, '1.2.3.4')

      props = @log[0][1]['data']['properties']
      expect(props['ip']).to eq('1.2.3.4')
      expect(props['$is_ai_bot']).to be true
    end
  end

  # These examples simulate the Rack middleware by writing the thread-local
  # by hand.
  describe '#track with Thread.current[:mixpanel_bot_classification]' do
    it 'uses thread-local classification when available' do
      Thread.current[:mixpanel_bot_classification] = {
        is_ai_bot: true,
        bot_name: 'GPTBot',
        provider: 'OpenAI',
        category: 'indexing',
      }

      @tracker.track('user123', 'page_view', { 'page' => '/home' })

      props = @log[0][1]['data']['properties']
      expect(props['$is_ai_bot']).to be true
      expect(props['$ai_bot_name']).to eq('GPTBot')
    end

    it 'prefers $user_agent property over thread-local when both present' do
      Thread.current[:mixpanel_bot_classification] = {
        is_ai_bot: true,
        bot_name: 'GPTBot',
        provider: 'OpenAI',
        category: 'indexing',
      }

      @tracker.track('user123', 'page_view', {
        '$user_agent' => 'ClaudeBot/1.0',
      })

      props = @log[0][1]['data']['properties']
      # $user_agent classification should take priority
      expect(props['$ai_bot_name']).to eq('ClaudeBot')
    end

    it 'adds non-bot classification from thread-local' do
      Thread.current[:mixpanel_bot_classification] = {
        is_ai_bot: false,
      }

      @tracker.track('user123', 'page_view', { 'page' => '/home' })

      props = @log[0][1]['data']['properties']
      expect(props['$is_ai_bot']).to be false
    end
  end

  describe 'multiple bot types' do
    it 'correctly classifies different bots in sequence' do
      # Each row: [user agent, expected bot name, expected provider]
      bots = [
        ['GPTBot/1.2', 'GPTBot', 'OpenAI'],
        ['ClaudeBot/1.0', 'ClaudeBot', 'Anthropic'],
        ['PerplexityBot/1.0', 'PerplexityBot', 'Perplexity'],
      ]

      bots.each do |ua, name, provider|
        # Start from an empty log so @log[0] is this iteration's event.
        @log.clear
        @tracker.track('user123', 'page_view', { '$user_agent' => ua })
        props = @log[0][1]['data']['properties']
        expect(props['$is_ai_bot']).to be(true), "Failed for #{ua}"
        expect(props['$ai_bot_name']).to eq(name), "Wrong name for #{ua}"
        expect(props['$ai_bot_provider']).to eq(provider), "Wrong provider for #{ua}"
      end
    end
  end
end

# spec/mixpanel-ruby/middleware/ai_bot_classifier_spec.rb
require 'spec_helper'
require 'rack'
require 'mixpanel-ruby/middleware/ai_bot_classifier'
require 'mixpanel-ruby/ai_bot_classifier'

describe Mixpanel::Middleware::AiBotClassifier do
  # Trivial downstream app that always answers 200 OK.
  let(:inner_app) { ->(env) { [200, {}, ['OK']] } }
  let(:middleware) { described_class.new(inner_app) }

  after(:each) do
    Thread.current[:mixpanel_bot_classification] = nil
  end

  # Builds a Rack env for http://example.com/test with the given user agent
  # and remote address, runs it through the middleware, and returns the env
  # so examples can inspect what the middleware stored in it.
  def make_request(user_agent: nil, remote_addr: '127.0.0.1')
    env = Rack::MockRequest.env_for('http://example.com/test', {
      'HTTP_USER_AGENT' => user_agent,
      'REMOTE_ADDR' => remote_addr,
    })
    middleware.call(env)
    env
  end

  describe 'request classification' do
    it 'classifies AI bot requests and stores in env' do
      env = make_request(user_agent: 'GPTBot/1.2')
      classification = env['mixpanel.bot_classification']

      expect(classification).not_to be_nil
      expect(classification[:is_ai_bot]).to be true
      expect(classification[:bot_name]).to eq('GPTBot')
      expect(classification[:provider]).to eq('OpenAI')
    end

    it 'classifies non-AI requests' do
      env = make_request(user_agent: 'Mozilla/5.0 Chrome/120')
      classification = env['mixpanel.bot_classification']

      expect(classification[:is_ai_bot]).to be false
    end

    it 'stores classification in Thread.current' do
      captured_classification = nil

      # The thread-local is cleared once the middleware returns, so it has
      # to be read from inside the downstream app.
      app = ->(env) {
        captured_classification = Thread.current[:mixpanel_bot_classification]
        [200, {}, ['OK']]
      }
      mw = described_class.new(app)

      env = Rack::MockRequest.env_for('/', {
        'HTTP_USER_AGENT' => 'GPTBot/1.2',
      })
      mw.call(env)

      expect(captured_classification).not_to be_nil
      expect(captured_classification[:is_ai_bot]).to be true
      expect(captured_classification[:bot_name]).to eq('GPTBot')
    end

    it 'cleans up Thread.current after request' do
      # NOTE(review): `env` is assigned but not used in this example.
      env = make_request(user_agent: 'GPTBot/1.2')
      expect(Thread.current[:mixpanel_bot_classification]).to be_nil
    end

    it 'cleans up Thread.current even if app raises' do
      app = ->(env) { raise RuntimeError, 'boom' }
      mw = described_class.new(app)

      env = Rack::MockRequest.env_for('/', {
        'HTTP_USER_AGENT' => 'GPTBot/1.2',
      })

      expect { mw.call(env) }.to raise_error(RuntimeError)
      expect(Thread.current[:mixpanel_bot_classification]).to be_nil
    end

    it 'handles missing User-Agent header' do
      env = make_request(user_agent: nil)
      classification = env['mixpanel.bot_classification']

      expect(classification[:is_ai_bot]).to be false
    end

    it 'stores IP address in classification' do
      env = make_request(
        user_agent: 'GPTBot/1.2',
        remote_addr: '1.2.3.4',
      )
      classification = env['mixpanel.bot_classification']

      expect(classification[:ip]).to eq('1.2.3.4')
    end

    # The first X-Forwarded-For entry is expected to win over REMOTE_ADDR.
    it 'extracts IP from X-Forwarded-For when present' do
      env = Rack::MockRequest.env_for('/', {
        'HTTP_USER_AGENT' => 'GPTBot/1.2',
        'HTTP_X_FORWARDED_FOR' => '5.6.7.8, 9.10.11.12',
        'REMOTE_ADDR' => '127.0.0.1',
      })
      middleware.call(env)
      classification = env['mixpanel.bot_classification']

      expect(classification[:ip]).to eq('5.6.7.8')
    end
  end

  describe 'passthrough behavior' do
    it 'passes the request through to the inner app' do
      status, _headers, body = middleware.call(
        Rack::MockRequest.env_for('/', {
          'HTTP_USER_AGENT' => 'GPTBot/1.2',
        })
      )
      expect(status).to eq(200)
      expect(body).to eq(['OK'])
    end

    it 'does not modify the response' do
      app = ->(env) { [201, { 'X-Custom' => 'value' }, ['Created']] }
      mw = described_class.new(app)

      status, headers, body = mw.call(
        Rack::MockRequest.env_for('/', {
          'HTTP_USER_AGENT' => 'Chrome/120',
        })
      )

      expect(status).to eq(201)
      expect(headers['X-Custom']).to eq('value')
      expect(body).to eq(['Created'])
    end
  end
end