<?php
/**
 * AI crawler detection class.
 *
 * @package WPG_LLMsTxt_Manager
 */

// Stop immediately when this file is requested directly: ABSPATH is only
// defined once the file has been loaded through WordPress.
defined('ABSPATH') || exit;

/**
 * Crawler Detector class.
 */
class WPG_LLMsTxt_Manager_Crawler_Detector
{

	/**
	 * Known AI crawlers, keyed by the display name that is written to the log.
	 *
	 * Each value is the substring searched for (case-insensitively) in the
	 * request's User-Agent header. Order matters: detect() returns the first
	 * match, so a specific pattern such as "PerplexityBot" must stay ahead of
	 * the shorter "Perplexity" pattern that it contains.
	 *
	 * @var array
	 */
	private $crawler_agents = array(
		'GPTBot' => 'GPTBot',
		'ChatGPT-User' => 'ChatGPT-User',
		'CCBot' => 'CCBot',
		'ClaudeBot' => 'ClaudeBot',
		'Claude-Web' => 'Claude-Web',
		'PerplexityBot' => 'PerplexityBot',
		'Perplexity' => 'Perplexity',
		'Google-Extended' => 'Google-Extended',
		'Bingbot' => 'bingbot',
		'Applebot-Extended' => 'Applebot-Extended',
	);

	/**
	 * Detect if current request is from an AI crawler.
	 *
	 * Detection is skipped entirely when the `wpg_llmstxt_crawler_detection`
	 * option is falsy (it defaults to enabled).
	 *
	 * @return array|false Array with `name` and `user_agent` keys, or false if
	 *                     detection is disabled, the User-Agent header is
	 *                     empty, or no known crawler pattern matches.
	 */
	public function detect()
	{
		if (!get_option('wpg_llmstxt_crawler_detection', 1)) {
			return false;
		}

		$user_agent = isset($_SERVER['HTTP_USER_AGENT']) ? sanitize_text_field(wp_unslash($_SERVER['HTTP_USER_AGENT'])) : '';

		if (empty($user_agent)) {
			return false;
		}

		// First match wins; see the ordering note on $crawler_agents.
		foreach ($this->crawler_agents as $name => $pattern) {
			if (false !== stripos($user_agent, $pattern)) {
				return array(
					'name' => $name,
					'user_agent' => $user_agent,
				);
			}
		}

		return false;
	}

	/**
	 * Log crawler visit.
	 *
	 * @param array $crawler_info Crawler information as returned by detect()
	 *                            (`name` and `user_agent` keys).
	 * @return int|false Inserted log ID on success, false on failure or when
	 *                   $crawler_info is not a complete crawler info array.
	 */
	public function log_visit($crawler_info)
	{
		global $wpdb;

		// detect() returns false for ordinary visitors; reject that (and any
		// other incomplete input) instead of reading undefined array offsets.
		if (!is_array($crawler_info) || !isset($crawler_info['name'], $crawler_info['user_agent'])) {
			return false;
		}

		$table_name = $wpdb->prefix . 'wpg_llmstxt_crawler_logs';

		$ip_address = $this->get_client_ip();
		$page_visited = isset($_SERVER['REQUEST_URI']) ? esc_url_raw(wp_unslash($_SERVER['REQUEST_URI'])) : '';

		// phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery
		$result = $wpdb->insert(
			$table_name,
			array(
				'crawler_name' => sanitize_text_field($crawler_info['name']),
				'user_agent' => sanitize_text_field($crawler_info['user_agent']),
				'ip_address' => sanitize_text_field($ip_address),
				// Site-local time; get_cutoff_date() builds its cutoffs on the same clock.
				'visit_date' => current_time('mysql'),
				'page_visited' => sanitize_text_field($page_visited),
			),
			array('%s', '%s', '%s', '%s', '%s')
		);

		return $result ? $wpdb->insert_id : false;
	}

	/**
	 * Get client IP address.
	 *
	 * Proxy/CDN headers are consulted before REMOTE_ADDR. Those headers are
	 * supplied by the client or an upstream proxy and can be forged, so the
	 * result is suitable for logging only, never for access control.
	 *
	 * @return string A syntactically valid IPv4/IPv6 address, or '0.0.0.0'
	 *                when no source provides one.
	 */
	private function get_client_ip()
	{
		$ip_keys = array(
			'HTTP_CF_CONNECTING_IP',
			'HTTP_X_REAL_IP',
			'HTTP_X_FORWARDED_FOR',
			'REMOTE_ADDR',
		);

		foreach ($ip_keys as $key) {
			if (empty($_SERVER[$key])) {
				continue;
			}

			$value = sanitize_text_field(wp_unslash($_SERVER[$key]));

			// Forwarding headers may carry a comma-separated proxy chain; the
			// first entry is the originating client.
			$candidate = trim(explode(',', $value)[0]);

			// Skip anything that is not an IP address and try the next source,
			// so arbitrary header content never reaches the log table.
			if (false !== filter_var($candidate, FILTER_VALIDATE_IP)) {
				return $candidate;
			}
		}

		return '0.0.0.0';
	}

	/**
	 * Build the `Y-m-d H:i:s` cutoff for "N days ago" in the site timezone.
	 *
	 * log_visit() stores visit_date via current_time('mysql'), i.e. site-local
	 * time, so cutoffs compared against that column must be expressed on the
	 * same clock; a UTC cutoff would shift the window by the site's UTC offset.
	 *
	 * @param int $days Number of days to go back; normalised with absint().
	 * @return string Cutoff date-time string.
	 */
	private function get_cutoff_date($days)
	{
		$cutoff_timestamp = time() - (absint($days) * DAY_IN_SECONDS);

		return wp_date('Y-m-d H:i:s', $cutoff_timestamp);
	}

	/**
	 * Get crawler statistics.
	 *
	 * @param int $days Number of days to look back.
	 * @return array Statistics with keys:
	 *               `total` (int visit count),
	 *               `by_crawler` (rows of crawler_name/count, most active first),
	 *               `recent` (up to 20 newest log rows). The two row lists are
	 *               always arrays, even if a query fails.
	 */
	public function get_statistics($days = 30)
	{
		global $wpdb;

		$table_name = $wpdb->prefix . 'wpg_llmstxt_crawler_logs';
		$date_limit = $this->get_cutoff_date($days);

		// The table name is built from $wpdb->prefix plus a fixed suffix, so it
		// is interpolated; every user-influenced value goes through prepare().
		// phpcs:disable WordPress.DB.PreparedSQL.InterpolatedNotPrepared, WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching, PluginCheck.Security.DirectDB.UnescapedDBParameter

		// Total visits
		$total = $wpdb->get_var(
			$wpdb->prepare(
				"SELECT COUNT(*) FROM `{$table_name}` WHERE visit_date >= %s",
				$date_limit
			)
		);

		// Visits by crawler
		$by_crawler = $wpdb->get_results(
			$wpdb->prepare(
				"SELECT crawler_name, COUNT(*) as count FROM `{$table_name}` WHERE visit_date >= %s GROUP BY crawler_name ORDER BY count DESC",
				$date_limit
			),
			ARRAY_A
		);

		// Recent visits
		$recent = $wpdb->get_results(
			$wpdb->prepare(
				"SELECT * FROM `{$table_name}` WHERE visit_date >= %s ORDER BY visit_date DESC LIMIT 20",
				$date_limit
			),
			ARRAY_A
		);

		// phpcs:enable WordPress.DB.PreparedSQL.InterpolatedNotPrepared, WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching, PluginCheck.Security.DirectDB.UnescapedDBParameter

		return array(
			'total' => absint($total),
			// get_results() yields null on failure; hand callers an empty list
			// so they can iterate without a type check.
			'by_crawler' => is_array($by_crawler) ? $by_crawler : array(),
			'recent' => is_array($recent) ? $recent : array(),
		);
	}

	/**
	 * Clean old logs.
	 *
	 * Deletes every log row whose visit_date is older than the cutoff.
	 *
	 * @param int $days Number of days to keep.
	 * @return int|false Number of rows deleted or false on failure.
	 */
	public function clean_old_logs($days = 90)
	{
		global $wpdb;

		$table_name = $wpdb->prefix . 'wpg_llmstxt_crawler_logs';
		$date_limit = $this->get_cutoff_date($days);

		// phpcs:disable WordPress.DB.PreparedSQL.InterpolatedNotPrepared, WordPress.DB.DirectDatabaseQuery.NoCaching, WordPress.DB.DirectDatabaseQuery.DirectQuery, PluginCheck.Security.DirectDB.UnescapedDBParameter
		$deleted = $wpdb->query(
			$wpdb->prepare(
				"DELETE FROM `{$table_name}` WHERE visit_date < %s",
				$date_limit
			)
		);
		// phpcs:enable WordPress.DB.PreparedSQL.InterpolatedNotPrepared, WordPress.DB.DirectDatabaseQuery.NoCaching, WordPress.DB.DirectDatabaseQuery.DirectQuery, PluginCheck.Security.DirectDB.UnescapedDBParameter

		return $deleted;
	}
}
