<?php
namespace AIZLabs\ChatAgent;

class SiteScanner {
    /**
     * Register the scanner's AJAX endpoints.
     *
     * Every endpoint is hooked on the matching `wp_ajax_*` action, and the
     * action suffix is also the name of the handler method on this class.
     */
    public function __construct() {
        $endpoints = array(
            'aizl_scan_website',
            'aizl_upload_scanned_content',
            'aizl_download_scan_file',
        );

        foreach ($endpoints as $endpoint) {
            add_action('wp_ajax_' . $endpoint, array($this, $endpoint));
        }
    }

    /**
     * AJAX handler: send the saved scan result to the browser as a download.
     *
     * The file is read from the plugin's folder inside the uploads directory
     * and streamed through PHP, which sidesteps web-server rules that block
     * direct access to .md files.
     */
    public function aizl_download_scan_file() {
        // A valid 'aizl_setting' nonce must accompany the request.
        $nonce = isset($_GET['nonce']) ? sanitize_text_field(wp_unslash($_GET['nonce'])) : null;
        if ($nonce === null || !wp_verify_nonce($nonce, 'aizl_setting')) {
            wp_die('Invalid security token.');
        }

        // Only users who can manage options may download the scan.
        if (!current_user_can('manage_options')) {
            wp_die('Unauthorized access.');
        }

        $uploads = wp_upload_dir();
        $scan_file = $uploads['basedir'] . '/aizlabs-chat-agent/website_content.md';

        if (!file_exists($scan_file)) {
            wp_die('File not found.');
        }

        // Download headers, emitted in this order.
        $download_headers = array(
            'Content-Type: text/markdown; charset=utf-8',
            'Content-Disposition: attachment; filename="website_content.md"',
            'Content-Length: ' . filesize($scan_file),
            'Cache-Control: no-cache, must-revalidate',
            'Pragma: no-cache',
        );
        foreach ($download_headers as $header_line) {
            header($header_line);
        }

        // Stream the file body and stop so nothing else is appended to it.
        // phpcs:ignore WordPress.WP.AlternativeFunctions.file_system_operations_readfile
        readfile($scan_file);
        exit;
    }

    /**
     * AJAX handler: scan the site, keep a local copy and return the Markdown.
     *
     * Responds with a JSON success payload containing `content`, `file_name`
     * and `file_size` (byte length of the Markdown), or a JSON error when the
     * nonce, the user's capability, the access key or the scan itself fails.
     */
    public function aizl_scan_website() {
        if (!check_ajax_referer('aizl_setting', 'nonce', false)) {
            wp_send_json_error('Invalid nonce.');
            return;
        }

        // A nonce is not an authorization check; require the same capability
        // the download endpoint requires.
        if (!current_user_can('manage_options')) {
            wp_send_json_error('Unauthorized access.');
            return;
        }

        // Validate access key before scanning
        $db_handler = new DBHandler();
        $access_key = $db_handler->get_access_key();
        if (!$access_key) {
            wp_send_json_error('Invalid Access Key');
            return;
        }

        try {
            $markdown_content = $this->generate_website_markdown();
            $file_size = strlen($markdown_content);

            // Save local copy for review (do it server-side to avoid WAF blocking AJAX)
            $this->save_content_locally($markdown_content);

            wp_send_json_success(array(
                'content' => $markdown_content,
                'file_name' => 'website_content.md',
                'file_size' => $file_size
            ));
        } catch (\Exception $e) {
            // Fully-qualified on purpose: an unqualified `Exception` inside this
            // namespace would refer to AIZLabs\ChatAgent\Exception and never match.
            wp_send_json_error('Failed to scan website: ' . $e->getMessage());
        }
    }

    /**
     * Save scanned content to the plugin's folder in the WordPress uploads
     * directory (server-side).
     *
     * Called directly during the scan to avoid WAF blocking large AJAX POST
     * requests. This is best-effort: failures are logged, never thrown, so a
     * read-only uploads directory does not abort the scan.
     *
     * @param string $content The markdown content to save
     * @return void
     */
    private function save_content_locally($content) {
        $upload_dir = wp_upload_dir();
        $aizlabs_dir = $upload_dir['basedir'] . '/aizlabs-chat-agent';

        if (!file_exists($aizlabs_dir) && !wp_mkdir_p($aizlabs_dir)) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - Could not create directory: ' . $aizlabs_dir);
            return;
        }

        $file_path = $aizlabs_dir . '/website_content.md';

        global $wp_filesystem;
        if (empty($wp_filesystem)) {
            require_once ABSPATH . 'wp-admin/includes/file.php';
            WP_Filesystem();
        }

        // Use the WP filesystem API only when it writes straight to disk;
        // otherwise fall back to a plain PHP write.
        if (!empty($wp_filesystem) && $wp_filesystem->method === 'direct') {
            $written = $wp_filesystem->put_contents($file_path, $content, FS_CHMOD_FILE);
        } else {
            // phpcs:ignore WordPress.WP.AlternativeFunctions.file_system_operations_file_put_contents
            $written = file_put_contents($file_path, $content);
        }

        // Both writers signal failure with boolean false (0 bytes is a valid write).
        if ($written === false) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - Failed to save local file: ' . $file_path);
            return;
        }

        // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
        error_log('Site Scanner - Local file saved: ' . $file_path);
    }

    /**
     * Produce a canonical form of a URL for comparison and de-duplication.
     *
     * Trailing slashes are stripped first, then the whole string is lowercased.
     *
     * @param string $url URL to normalize
     * @return string Normalized URL
     */
    private function normalize_url($url) {
        return strtolower(rtrim($url, '/'));
    }

    /**
     * Build the Markdown knowledge base for the site.
     *
     * The document contains site information, a listing of every navigation
     * menu, and the extracted text of each internal page that could be
     * fetched. Pages to scan are the homepage plus internal menu links; when
     * that yields nothing beyond the homepage, sitemap.xml is tried, and
     * after that up to 50 published pages. Pages with no content are left out
     * of the document and reported in the error log.
     *
     * @return string The generated Markdown
     */
    private function generate_website_markdown() {
        $markdown = "# Website Content Knowledge Base\n\n";
        $markdown .= "Generated on: " . wp_date('Y-m-d H:i:s') . "\n\n";
        $markdown .= "---\n\n";

        // Track failed pages
        $failed_pages = array();

        // Get site info
        $markdown .= "## Site Information\n\n";
        $markdown .= "- **Site Name:** " . get_bloginfo('name') . "\n";
        $markdown .= "- **Site URL:** " . get_site_url() . "\n";
        $markdown .= "- **Description:** " . get_bloginfo('description') . "\n\n";

        // Resolved once instead of on every menu item.
        $site_root = $this->normalize_url(get_site_url());

        // A URL is internal when it IS the site root or continues it at a URL
        // boundary. A bare prefix match would also accept look-alike hosts
        // such as "https://example.com.evil.tld".
        $is_internal = function ($candidate) use ($site_root) {
            $candidate = $this->normalize_url((string) $candidate);
            if ($candidate === $site_root) {
                return true;
            }
            foreach (array('/', '?', '#') as $boundary) {
                if (strpos($candidate, $site_root . $boundary) === 0) {
                    return true;
                }
            }
            return false;
        };

        // Collect all URLs to scan, keyed by normalized URL to avoid duplicates
        $urls_to_scan = array();

        // Always add homepage first
        $homepage_url = get_home_url();
        $urls_to_scan[$this->normalize_url($homepage_url)] = array(
            'title' => 'Homepage',
            'url' => $homepage_url
        );

        // Get navigation menu URLs
        $markdown .= "## Navigation Menus\n\n";
        $menus = wp_get_nav_menus();

        if ($menus) {
            foreach ($menus as $menu) {
                $markdown .= "### " . esc_html($menu->name) . "\n\n";
                $menu_items = wp_get_nav_menu_items($menu->term_id);

                if ($menu_items) {
                    foreach ($menu_items as $item) {
                        $markdown .= "- " . esc_html($item->title) . ": " . esc_url($item->url) . "\n";

                        // External URLs stay in the menu listing above but are
                        // never queued for content extraction.
                        if (!$is_internal($item->url)) {
                            continue;
                        }

                        $normalized_key = $this->normalize_url($item->url);

                        // Only add if not already in array (prevents duplicates)
                        if (!isset($urls_to_scan[$normalized_key])) {
                            $urls_to_scan[$normalized_key] = array(
                                'title' => $item->title,
                                'url' => $item->url
                            );
                        }
                    }
                }
                $markdown .= "\n";
            }
        } else {
            $markdown .= "No navigation menus found.\n\n";
        }

        // Fallback: If no menus or only homepage, try alternative methods
        if (count($urls_to_scan) === 1) { // Only homepage in scan list
            $fallback_urls_added = false;

            // Fallback Level 1: Try sitemap.xml
            $sitemap_url = trailingslashit(get_site_url()) . 'sitemap.xml';
            $sitemap_response = wp_remote_get($sitemap_url, array(
                'timeout' => 10,
                'sslverify' => false
            ));

            if (!is_wp_error($sitemap_response) && wp_remote_retrieve_response_code($sitemap_response) === 200) {
                $sitemap_xml = wp_remote_retrieve_body($sitemap_response);

                // Parse sitemap XML
                $urls_from_sitemap = $this->parse_sitemap_xml($sitemap_xml);

                foreach ($urls_from_sitemap as $url_data) {
                    $normalized_key = $this->normalize_url($url_data['url']);

                    if (!isset($urls_to_scan[$normalized_key])) {
                        $urls_to_scan[$normalized_key] = $url_data;
                    }
                }

                // The sitemap only counts as a success when it contributed
                // something beyond the homepage that is already queued.
                $fallback_urls_added = count($urls_to_scan) > 1;
            }

            // Fallback Level 2: If sitemap failed, get all published pages
            if (!$fallback_urls_added) {
                $pages = get_pages(array(
                    'post_status' => 'publish',
                    'sort_column' => 'menu_order, post_title',
                    'number' => 50 // Limit to prevent excessive scanning
                ));

                // get_pages() may return a non-array on failure; treat as empty.
                if (!is_array($pages)) {
                    $pages = array();
                }

                foreach ($pages as $page) {
                    $page_url = get_permalink($page->ID);
                    $normalized_key = $this->normalize_url($page_url);

                    if (!isset($urls_to_scan[$normalized_key])) {
                        $urls_to_scan[$normalized_key] = array(
                            'title' => $page->post_title,
                            'url' => $page_url
                        );
                    }
                }
            }
        }

        // Fetch and extract content from each URL
        $markdown .= "## Page Content\n\n";

        foreach ($urls_to_scan as $page_data) {
            $title = $page_data['title'];
            $url = $page_data['url'];

            $markdown .= "### " . esc_html($title) . "\n\n";
            $markdown .= "**URL:** " . esc_url($url) . "\n\n";

            // Fetch page content
            $content = $this->fetch_page_content($url);

            if ($content) {
                $markdown .= "**Content:**\n\n" . $content . "\n\n";
                $markdown .= "---\n\n";
            } else {
                // Track failed page instead of adding to markdown
                $failed_pages[] = $title . ' (' . $url . ')';
            }
        }

        // Report pages without content, then return whatever was collected
        if (!empty($failed_pages)) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - Warning: ' . count($failed_pages) . ' pages had no content: ' . implode(', ', $failed_pages));
        }

        return $markdown;
    }

    /**
     * Fetch a page of this site and return its visible text.
     *
     * The URL must belong to this site; anything else is rejected before any
     * request is made. The page is fetched over HTTP, and when that fails
     * (transport error, non-200 status or empty body) the content is looked
     * up in the database instead.
     *
     * @param string $url Absolute URL of the page
     * @return string|null Extracted text, or null when the URL is external or
     *                     no content could be obtained
     */
    private function fetch_page_content($url) {
        // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
        error_log('Site Scanner - Attempting to fetch: ' . $url);

        // Only fetch content from same site (security measure)
        $site_url = get_site_url();

        // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
        error_log('Site Scanner - Site URL: ' . $site_url);

        // Normalize URLs before comparison to handle trailing slashes
        $normalized_url = $this->normalize_url($url);
        $normalized_site_url = $this->normalize_url($site_url);

        // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
        error_log('Site Scanner - Normalized URL: ' . $normalized_url);
        // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
        error_log('Site Scanner - Normalized Site URL: ' . $normalized_site_url);

        // The site URL must be followed by a URL boundary (or nothing at all).
        // A bare prefix test would let "https://example.com.evil.tld" or a
        // sibling directory such as "/blog2" pass as part of "/blog".
        $is_same_site = $normalized_url === $normalized_site_url
            || strpos($normalized_url, $normalized_site_url . '/') === 0
            || strpos($normalized_url, $normalized_site_url . '?') === 0
            || strpos($normalized_url, $normalized_site_url . '#') === 0;

        if (!$is_same_site) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - REJECTED: URL does not match site URL');
            return null; // Skip external URLs
        }

        // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
        error_log('Site Scanner - PASSED: Security check, fetching via HTTP...');

        // Try Method 1: HTTP fetch with original URL.
        // NOTE(review): certificate verification is disabled; presumably this
        // is to tolerate self-signed certificates on loopback requests — confirm.
        $response = wp_remote_get($url, array(
            'timeout' => 15,
            'sslverify' => false,
            'headers' => array(
                'User-Agent' => 'WordPress/' . get_bloginfo('version') . '; ' . get_bloginfo('url')
            )
        ));

        // Check for errors
        if (is_wp_error($response)) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - HTTP fetch failed: ' . $response->get_error_message() . ', trying database fallback');
            return $this->fetch_content_from_database($url);
        }

        $status_code = wp_remote_retrieve_response_code($response);
        if ($status_code !== 200) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - HTTP ' . $status_code . ' for ' . $url . ', trying database fallback');
            return $this->fetch_content_from_database($url);
        }

        // Get the HTML body
        $html = wp_remote_retrieve_body($response);

        if (empty($html)) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - Empty response, trying database fallback');
            return $this->fetch_content_from_database($url);
        }

        // Extract text content from HTML
        return $this->extract_text_from_html($html);
    }

    /**
     * Reduce an HTML document to plain, single-spaced text.
     *
     * Script blocks, style blocks and comments are dropped with their content,
     * the remaining tags are stripped, entities are decoded and whitespace
     * runs are collapsed to one space. The result is capped at 50000 bytes,
     * with a truncation marker appended when the cap is hit.
     *
     * @param string $html HTML markup
     * @return string Plain text
     */
    private function extract_text_from_html($html) {
        $html = (string) $html;

        // Script, style and comment removal. The comment pattern uses the `s`
        // flag instead of `(.|\s)*?`, which matches the same text but is prone
        // to exhausting PCRE limits on long comments.
        $removal_patterns = array(
            '/<script\b[^>]*>.*?<\/script>/is',
            '/<style\b[^>]*>.*?<\/style>/is',
            '/<!--.*?-->/s',
        );
        foreach ($removal_patterns as $pattern) {
            $stripped = preg_replace($pattern, '', $html);
            // preg_replace() returns null on a PCRE error; keep the previous
            // markup rather than discarding the whole page.
            if ($stripped !== null) {
                $html = $stripped;
            }
        }

        // Strip all remaining HTML tags
        $text = wp_strip_all_tags($html);

        // Decode HTML entities
        $text = html_entity_decode($text, ENT_QUOTES | ENT_HTML5, 'UTF-8');

        // Remove excessive whitespace
        $collapsed = preg_replace('/\s+/', ' ', $text);
        if ($collapsed !== null) {
            $text = $collapsed;
        }
        $text = trim($text);

        // Limit length to prevent huge files
        if (strlen($text) > 50000) {
            // mb_strcut() honours the byte limit without splitting a multi-byte
            // character; a plain substr() can leave invalid UTF-8 at the end.
            $text = function_exists('mb_strcut')
                ? mb_strcut($text, 0, 50000, 'UTF-8')
                : substr($text, 0, 50000);
            $text .= '... (content truncated)';
        }

        return $text;
    }

    /**
     * Fetch content from the WordPress database as a fallback.
     *
     * The URL's path is resolved to a page (or, failing that, a post); an
     * empty path resolves to the static front page. The post content is run
     * through the 'the_content' filter so shortcodes are rendered, then
     * reduced to text. Only published posts without a password are returned,
     * so unpublished or protected content never reaches the export.
     *
     * @param string $url Absolute URL of the page
     * @return string|null Extracted text, or null when nothing public matches
     */
    private function fetch_content_from_database($url) {
        // parse_url() yields null (no path) or false (malformed URL); the cast
        // turns both into an empty string.
        $path = trim((string) parse_url($url, PHP_URL_PATH), '/');

        // On a sub-directory install the URL path starts with the home path
        // (e.g. "blog/about"), while page paths are relative to it ("about").
        $home_path = trim((string) parse_url(get_home_url(), PHP_URL_PATH), '/');
        if ($home_path !== '') {
            if (strcasecmp($path, $home_path) === 0) {
                $path = '';
            } elseif (stripos($path, $home_path . '/') === 0) {
                $path = substr($path, strlen($home_path) + 1);
            }
        }

        // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
        error_log('Site Scanner - Database fallback: Attempting path: ' . $path);

        // Handle homepage
        if (empty($path)) {
            $page_id = get_option('page_on_front');
            if ($page_id) {
                $page = get_post($page_id);
            } else {
                return null; // No static homepage
            }
        } else {
            // Try to get page by path
            $page = get_page_by_path($path);

            // If not found as page, try as post
            if (!$page) {
                $page = get_page_by_path($path, OBJECT, 'post');
            }
        }

        if (!$page) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - Database fallback: Page not found for path: ' . $path);
            return null;
        }

        // The path lookup above is not restricted to published posts, so
        // check visibility before exporting anything.
        if ($page->post_status !== 'publish' || !empty($page->post_password)) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - Database fallback: Skipping non-public page for path: ' . $path);
            return null;
        }

        // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
        error_log('Site Scanner - Database fallback: Found page: ' . $page->post_title);

        // Apply content filters to render shortcodes
        $content = apply_filters('the_content', $page->post_content);

        // Extract text from HTML
        return $this->extract_text_from_html($content);
    }

    /**
     * AJAX handler: upload scanned content to remote storage.
     *
     * Expects POST fields `nonce`, `file_content`, `file_name` and
     * `file_size`. A presigned upload URL is requested from the API and the
     * content is PUT to it. On success the JSON payload carries `file_key`,
     * `file_name` and `file_size`; every failure produces a JSON error.
     *
     * `file_size` must be present for request-shape compatibility, but the
     * size that is checked, sent to the API and reported back is the byte
     * length of the content actually received.
     */
    public function aizl_upload_scanned_content() {
        if (!check_ajax_referer('aizl_setting', 'nonce', false)) {
            wp_send_json_error('Invalid nonce.');
            return;
        }

        // A nonce is not an authorization check; require the same capability
        // the download endpoint requires.
        if (!current_user_can('manage_options')) {
            wp_send_json_error('Unauthorized access.');
            return;
        }

        if (!isset($_POST['file_content']) || !isset($_POST['file_name']) || !isset($_POST['file_size'])) {
            wp_send_json_error('Missing required parameters.');
            return;
        }

        // Array-style fields (file_content[]=...) would break the string
        // handling below.
        if (!is_string($_POST['file_content']) || !is_string($_POST['file_name'])) {
            wp_send_json_error('Invalid parameters.');
            return;
        }

        $file_content = wp_unslash($_POST['file_content']);
        $file_name = sanitize_file_name(wp_unslash($_POST['file_name']));

        // Measure the payload instead of trusting the client-declared size,
        // which could be used to slip past the limit below.
        $file_size = strlen($file_content);

        // Check file size
        if ($file_size > 10 * 1024 * 1024) {
            wp_send_json_error('Generated file too large. Maximum 10MB allowed.');
            return;
        }

        $db_handler = new DBHandler();
        $access_key = $db_handler->get_access_key();
        if (!$access_key) {
            wp_send_json_error('Invalid Access Key');
            return;
        }

        // Get presigned URL for S3 upload
        $presigned_response = $this->get_presigned_url($file_name, $file_size, $access_key);

        if (is_wp_error($presigned_response)
            || (int) wp_remote_retrieve_response_code($presigned_response) !== 200) {
            wp_send_json_error('Failed to get upload URL.');
            return;
        }

        // The body must decode to an array carrying both fields used below.
        $presigned_data = json_decode(wp_remote_retrieve_body($presigned_response), true);
        if (!is_array($presigned_data)
            || empty($presigned_data['upload_url'])
            || empty($presigned_data['file_key'])) {
            wp_send_json_error('Failed to get upload URL.');
            return;
        }

        $upload_url = $presigned_data['upload_url'];
        $file_key = $presigned_data['file_key'];

        // Upload content directly to S3
        $s3_response = wp_remote_request($upload_url, array(
            'method' => 'PUT',
            'headers' => array(
                'Content-Type' => 'application/octet-stream',
            ),
            'body' => $file_content,
            'timeout' => 60,
        ));

        if (is_wp_error($s3_response) || (int) wp_remote_retrieve_response_code($s3_response) !== 200) {
            wp_send_json_error('Failed to upload to S3.');
            return;
        }

        // Return file info for processing
        wp_send_json_success(array(
            'file_key' => $file_key,
            'file_name' => $file_name,
            'file_size' => $file_size
        ));
    }

    /**
     * Parse sitemap XML and extract internal URLs.
     *
     * Handles both a plain sitemap (`<url>` entries) and a sitemap index
     * (`<sitemap>` entries, each fetched and parsed recursively). Only URLs
     * that belong to this site are returned, and at most 50 of them. Child
     * sitemaps are followed only on this site, only while the cap has not
     * been reached, and only to a limited nesting depth, so a self-referencing
     * index cannot recurse forever.
     *
     * @param string $xml   The sitemap XML content
     * @param int    $depth Current nesting level; callers outside this method
     *                      leave it at the default
     * @return array Array of URL data (title, url)
     */
    private function parse_sitemap_xml($xml, $depth = 0) {
        $max_urls = 50;  // Limit to prevent excessive scanning
        $max_depth = 2;  // Nested indexes followed below the root document
        $urls = array();

        if (!is_string($xml) || trim($xml) === '') {
            return $urls;
        }

        // Collect XML errors internally instead of emitting warnings, then put
        // the libxml state back the way it was found.
        $previous_error_mode = libxml_use_internal_errors(true);
        $sitemap = simplexml_load_string($xml, 'SimpleXMLElement', LIBXML_NONET);
        libxml_clear_errors();
        libxml_use_internal_errors($previous_error_mode);

        if ($sitemap === false) {
            // phpcs:ignore WordPress.PHP.DevelopmentFunctions.error_log_error_log
            error_log('Site Scanner - Failed to parse sitemap XML');
            return $urls;
        }

        $site_root = $this->normalize_url(get_site_url());

        // Internal means: the site root itself, or the site root followed by a
        // URL boundary. A bare prefix match would accept look-alike hosts.
        $is_internal = function ($candidate) use ($site_root) {
            $candidate = $this->normalize_url((string) $candidate);
            if ($candidate === $site_root) {
                return true;
            }
            foreach (array('/', '?', '#') as $boundary) {
                if (strpos($candidate, $site_root . $boundary) === 0) {
                    return true;
                }
            }
            return false;
        };

        // Handle standard sitemap
        if (isset($sitemap->url)) {
            foreach ($sitemap->url as $url_entry) {
                if (count($urls) >= $max_urls) {
                    break;
                }

                // <loc> values are frequently padded with whitespace.
                $loc = trim((string) $url_entry->loc);

                // Only include internal URLs
                if (!$is_internal($loc)) {
                    continue;
                }

                // Derive a readable title from the URL path (all segments)
                $path = (string) parse_url($loc, PHP_URL_PATH);
                $title = ucwords(str_replace(array('/', '-', '_'), ' ', trim($path, '/')));
                if (empty($title)) {
                    $title = 'Homepage';
                }

                $urls[] = array(
                    'title' => $title,
                    'url' => $loc
                );
            }
        }

        // Handle sitemap index (links to other sitemaps)
        if (isset($sitemap->sitemap) && $depth < $max_depth) {
            foreach ($sitemap->sitemap as $sitemap_entry) {
                // Each child costs an HTTP round trip; stop once the cap is met.
                if (count($urls) >= $max_urls) {
                    break;
                }

                $sitemap_loc = trim((string) $sitemap_entry->loc);

                // Never follow a child sitemap hosted somewhere else.
                if (!$is_internal($sitemap_loc)) {
                    continue;
                }

                // Fetch and parse child sitemap
                $child_response = wp_remote_get($sitemap_loc, array(
                    'timeout' => 10,
                    'sslverify' => false
                ));

                if (!is_wp_error($child_response) && wp_remote_retrieve_response_code($child_response) === 200) {
                    $child_xml = wp_remote_retrieve_body($child_response);
                    $child_urls = $this->parse_sitemap_xml($child_xml, $depth + 1);
                    $urls = array_merge($urls, $child_urls);
                }
            }
        }

        return array_slice($urls, 0, $max_urls);
    }

    /**
     * Ask the API's 'presign' endpoint for an upload URL.
     *
     * Sends the file name and size as a JSON body, with the access key and
     * plugin source passed as request headers.
     *
     * @param string $file_name  Name of the file to upload
     * @param int    $file_size  Size of the file
     * @param string $access_key Access key sent in the x-access-key header
     * @return array|\WP_Error Whatever wp_remote_post() returns
     */
    private function get_presigned_url($file_name, $file_size, $access_key) {
        $endpoint = Config::get_api_endpoint('presign');

        $payload = json_encode(array(
            'file_name' => $file_name,
            'file_size' => $file_size
        ));

        $request_headers = array(
            'Content-Type' => 'application/json',
            'x-access-key' => $access_key,
            'x-plugin-source' => 'aizlabs_chat_agent'
        );

        return wp_remote_post($endpoint, array(
            'method' => 'POST',
            'body' => $payload,
            'headers' => $request_headers,
            'timeout' => 60,
        ));
    }
}
