<?php
/**
 * Live Site Scan - Analyzes rendered front-end HTML to detect dynamic media usage
 */
class ORPHANIX_Live_Scan {
    protected $max_urls = 500;
    protected $max_html_size = 2097152; // 2MB

    /**
     * Prepare a fresh Live Scan run for a media scan.
     *
     * Zeroes the live-usage columns of every item belonging to the scan, builds
     * the list of public URLs to crawl, stores the initial progress record in a
     * one-hour transient and records start time, status and mode in options.
     *
     * @param int $media_scan_id ID of the media scan whose items will be checked.
     * @return void
     */
    public function init_live_scan($media_scan_id) {
        global $wpdb;

        $items_table = "{$wpdb->prefix}orphanix_scan_items";
        $cleared_columns = [
            'live_used' => 0,
            'live_url_count' => 0,
        ];

        // Wipe any live-usage results left over from an earlier run of this scan.
        // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
        $wpdb->update($items_table, $cleared_columns, ['scan_id' => $media_scan_id], ['%d', '%d'], ['%d']);

        $queue = $this->get_public_urls();

        // The whole URL queue travels with the progress record so that each
        // batch request can resume from 'index'.
        $initial_state = [
            'status' => 'running',
            'processed' => 0,
            'total' => count($queue),
            'percentage' => 0,
            'current_url' => '',
            'urls' => $queue,
            'index' => 0,
        ];
        set_transient('orphanix_live_scan_progress_' . $media_scan_id, $initial_state, 3600);

        $run_markers = [
            'orphanix_last_live_scan_at' => current_time('mysql'),
            'orphanix_last_live_scan_status' => 'running',
            'orphanix_last_live_scan_mode' => 'Full Site',
        ];
        foreach ($run_markers as $option_name => $option_value) {
            update_option($option_name, $option_value);
        }
    }

    /**
     * Process the next batch of queued URLs for a Live Scan.
     *
     * Reads the progress transient written by init_live_scan(), crawls up to
     * $batch_size URLs starting at the stored index, records every scan item
     * whose attachment appears on a page, then writes the updated progress back.
     *
     * Differences from a naive per-URL loop, all deliberate:
     *  - The page title costs an extra HTTP request, so it is fetched only once
     *    a page is known to reference at least one scan item.
     *  - Each attachment is counted at most once per page, so live_url_count
     *    reflects the number of pages and not the number of matching tags.
     *  - $batch_size is clamped to at least 1; a zero/negative value would
     *    otherwise leave the scan "running" forever.
     *
     * @param int $media_scan_id ID of the media scan being checked.
     * @param int $batch_size    Maximum number of URLs to crawl in this call.
     * @return array|false|mixed The progress record (without 'urls' once the scan
     *                           is completed), or whatever get_transient() returned
     *                           when there is no queue left to process.
     */
    public function process_live_scan_batch($media_scan_id, $batch_size = 5) {
        global $wpdb;

        $transient_key = 'orphanix_live_scan_progress_' . $media_scan_id;

        $progress = get_transient($transient_key);
        if (empty($progress) || empty($progress['urls'])) {
            return $progress;
        }

        $urls = $progress['urls'];
        $total_urls = isset($progress['total']) ? intval($progress['total']) : count($urls);
        $index = isset($progress['index']) ? max(0, intval($progress['index'])) : 0;
        $batch_size = max(1, intval($batch_size));

        $end = min($index + $batch_size, $total_urls);

        while ($index < $end) {
            $url = isset($urls[$index]) ? $urls[$index] : '';
            $index++;

            if (!is_string($url) || $url === '') {
                continue;
            }

            $page_media = $this->fetch_and_parse_url($url);
            if (!is_array($page_media) || empty($page_media)) {
                continue;
            }

            $page_title = null;       // Fetched lazily, see docblock.
            $seen_attachments = [];   // attachment ID => true, for this page only.

            foreach ($page_media as $media_url) {
                $resolved = $this->resolve_url_to_attachment($media_url);
                if (!$resolved || $resolved['type'] !== 'attachment') {
                    continue;
                }

                $attachment_id = intval($resolved['id']);
                if (isset($seen_attachments[$attachment_id])) {
                    continue;
                }
                $seen_attachments[$attachment_id] = true;

                // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
                $items = $wpdb->get_results($wpdb->prepare(
                    "SELECT id FROM {$wpdb->prefix}orphanix_scan_items 
                     WHERE scan_id = %d AND attachment_id = %d",
                    $media_scan_id,
                    $attachment_id
                ));

                if (empty($items)) {
                    continue;
                }

                if ($page_title === null) {
                    $page_title = $this->get_page_title($url);
                }

                foreach ($items as $item) {
                    $this->increment_live_usage($item->id, $url, $page_title);
                }
            }
        }

        $percentage = $total_urls > 0 ? round(($index) / $total_urls * 100) : 100;
        $current_url = ($index > 0 && isset($urls[$index - 1])) ? $urls[$index - 1] : '';

        $status = ($index >= $total_urls) ? 'completed' : 'running';

        $progress = [
            'status' => $status,
            'processed' => $index,
            'total' => $total_urls,
            'percentage' => $percentage,
            'current_url' => $current_url,
            'urls' => $urls,
            'index' => $index,
        ];

        // The queue is no longer needed once every URL has been visited.
        if ($status === 'completed') {
            unset($progress['urls']);
        }

        set_transient($transient_key, $progress, 3600);
        if ($status === 'completed') {
            update_option('orphanix_last_live_scan_at', current_time('mysql'));
            update_option('orphanix_last_live_scan_status', 'completed');
        }
        return $progress;
    }

    /**
     * Record that a scan item was seen on a front-end page.
     *
     * Marks the item as live-used, bumps its live_url_count by one and stores a
     * detail row (page URL, page title, timestamp) in the live-scan details table.
     *
     * @param int    $item_id    Primary key of the row in orphanix_scan_items.
     * @param string $page_url   URL of the page on which the media was found.
     * @param string $page_title Title of that page; truncated to 255 characters.
     * @return void
     */
    private function increment_live_usage($item_id, $page_url, $page_title) {
        global $wpdb;

        // Update count
        // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
        $wpdb->query(
            $wpdb->prepare(
                "UPDATE {$wpdb->prefix}orphanix_scan_items 
                 SET live_used = 1, live_url_count = live_url_count + 1 
                 WHERE id = %d",
                $item_id
            )
        );

        // Truncate by characters, not bytes: a byte-wise cut can split a
        // multibyte UTF-8 character and leave an invalid string behind.
        // NOTE(review): 255 presumably mirrors the page_title column width — confirm against the schema.
        $safe_title = mb_substr((string) $page_title, 0, 255, 'UTF-8');

        // Store URL details
        // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
        $wpdb->insert(
            "{$wpdb->prefix}orphanix_live_scan_details",
            [
                'scan_item_id' => $item_id,
                'page_url' => $page_url,
                'page_title' => $safe_title,
                'created_at' => current_time('mysql')
            ],
            ['%d', '%s', '%s', '%s']
        );
    }

    /**
     * Determine a human-readable title for a page URL.
     *
     * Tries, in order:
     *  1. the text of the <title> element in the fetched HTML,
     *  2. the title of the post that url_to_postid() maps the URL to,
     *  3. the literal string 'Unknown'.
     *
     * An empty or whitespace-only <title>, or a failed HTTP request, falls
     * through to step 2 instead of ending the lookup, because the post-title
     * lookup needs no network access.
     *
     * @param string $url Page URL.
     * @return string Non-empty title, or 'Unknown' when none could be determined.
     */
    private function get_page_title($url) {
        $response = wp_remote_get($url, [
            'timeout' => 5,
        ]);

        if (!is_wp_error($response)) {
            $html = wp_remote_retrieve_body($response);

            if (is_string($html) && preg_match('/<title[^>]*>(.*?)<\/title>/si', $html, $match)) {
                $title = trim( wp_strip_all_tags( $match[1] ) );
                if ($title !== '') {
                    return $title;
                }
            }
        }

        // Fallback: try to get post title from URL
        $post_id = url_to_postid($url);
        if ($post_id) {
            $title = trim((string) get_the_title($post_id));
            if ($title !== '') {
                return $title;
            }
        }

        return 'Unknown';
    }

    /**
     * Build the list of public front-end URLs to crawl.
     *
     * The list starts with the homepage, followed by the permalinks of all
     * published posts of every public post type (fetched in pages of 200 IDs,
     * ordered by ID), and — when the 'orphanix_live_scan_archives' option is
     * enabled — category and tag archive links. Collection of post permalinks
     * stops as soon as $this->max_urls entries have been gathered.
     *
     * Entries that are not non-empty strings (for example a false return from
     * get_permalink()) are dropped so they neither consume the URL limit nor
     * reach the HTTP client.
     *
     * @return string[] Unique URLs, re-indexed from 0, at most $this->max_urls long.
     */
    public function get_public_urls() {
        $urls = [];
        $batch_size = 200;

        // Homepage
        $urls[] = home_url('/');

        // Get all public post types
        $post_types = get_post_types(['public' => true], 'names');
        $reached_limit = false;

        foreach ($post_types as $pt) {
            $paged = 1;

            while ( true ) {
                $posts = get_posts([
                    'post_type'      => $pt,
                    'post_status'    => 'publish',
                    'posts_per_page' => $batch_size,
                    'paged'          => $paged,
                    'fields'         => 'ids',
                    'no_found_rows'  => true,
                    'orderby'        => 'ID',
                    'order'          => 'ASC',
                ]);

                if ( empty( $posts ) ) {
                    break;
                }

                foreach ( $posts as $id ) {
                    $permalink = get_permalink( $id );
                    if ( ! is_string( $permalink ) || '' === $permalink ) {
                        continue;
                    }

                    $urls[] = $permalink;
                    if ( count( $urls ) >= $this->max_urls ) {
                        $reached_limit = true;
                        break;
                    }
                }

                // A short page means this post type has no further results.
                if ( $reached_limit || count( $posts ) < $batch_size ) {
                    break;
                }

                $paged++;
            }

            if ( $reached_limit ) {
                break;
            }
        }

        // Optional: Add archive pages if setting enabled
        if (get_option('orphanix_live_scan_archives', false)) {
            $urls = array_merge($urls, $this->get_archive_urls());
        }

        $urls = array_filter($urls, static function ($candidate) {
            return is_string($candidate) && $candidate !== '';
        });
        $urls = array_unique($urls);

        // array_slice() re-indexes numeric keys, so callers can address by offset.
        return array_slice($urls, 0, $this->max_urls);
    }

    /**
     * Collect archive links for every category and tag.
     *
     * Terms without posts are included as well (hide_empty is false). Category
     * links come first, followed by tag links.
     *
     * @return array Archive links in the order described above.
     */
    private function get_archive_urls() {
        $archive_links = [];

        foreach (get_categories(['hide_empty' => false]) as $category) {
            array_push($archive_links, get_category_link($category->term_id));
        }

        foreach (get_tags(['hide_empty' => false]) as $post_tag) {
            array_push($archive_links, get_tag_link($post_tag->term_id));
        }

        return $archive_links;
    }

    /**
     * Download a page and return the media URLs found in its HTML.
     *
     * This is best-effort: any failure (transport error, empty body, body
     * larger than $this->max_html_size, parser failure) yields an empty array
     * so that one bad page never interrupts the scan.
     *
     * @param string $url Page URL to fetch.
     * @return array Normalized media URLs referenced by the page; empty on failure.
     */
    public function fetch_and_parse_url($url) {
        $response = wp_remote_get($url, [
            'timeout'  => 5,  // Reduced from 10 to 5 seconds
            'headers'  => [
                'User-Agent' => 'Orphanix Media Cleanup Live Scan'
            ],
            'blocking' => true,
            // Stop downloading one byte past the limit: enough for the size
            // check below to reject the page without buffering the whole body.
            'limit_response_size' => $this->max_html_size + 1,
        ]);

        if (is_wp_error($response)) {
            // Transport failure: skip this page but continue scanning
            return [];
        }

        $html = wp_remote_retrieve_body($response);

        // An empty body has nothing to parse, and DOMDocument::loadHTML()
        // throws a ValueError for an empty string on PHP 8+.
        if (!is_string($html) || trim($html) === '') {
            return [];
        }

        // Check size limit
        if (strlen($html) > $this->max_html_size) {
            return [];
        }

        try {
            return $this->extract_media_urls($html);
        } catch (Throwable $e) {
            // Throwable rather than Exception: parser failures on PHP 8 are
            // Error subclasses. If parsing fails, skip this page.
            return [];
        }
    }

    /**
     * Extract media URLs from an HTML document.
     *
     * Sources inspected:
     *  - src on <img>, <video>, <audio> and on <source> inside <video>/<audio>
     *  - poster on <video>
     *  - srcset on <img> and <source> (covers <picture>)
     *  - every url(...) inside inline style attributes
     *
     * data: URIs are ignored because they cannot reference an uploaded file.
     * Every candidate is passed through normalize_url() before being returned.
     * Parsing is best-effort: on a parser failure whatever was collected so far
     * is returned.
     *
     * @param string $html Raw HTML, treated as UTF-8.
     * @return array Unique normalized media URLs, re-indexed from 0.
     */
    private function extract_media_urls($html) {
        $media_urls = [];

        // DOMDocument::loadHTML() throws a ValueError for an empty string on PHP 8+.
        if (!is_string($html) || trim($html) === '') {
            return $media_urls;
        }

        // Adds one candidate to the result list after filtering and normalizing.
        $collect = function ($candidate) use (&$media_urls) {
            $candidate = trim((string) $candidate);
            if ($candidate === '' || stripos($candidate, 'data:') === 0) {
                return;
            }

            $normalized = $this->normalize_url($candidate);
            if (is_string($normalized) && $normalized !== '') {
                $media_urls[] = $normalized;
            }
        };

        // Collect parser warnings internally (malformed HTML is normal on the
        // web) and remember the previous mode so it can be restored afterwards.
        $previous_libxml_mode = libxml_use_internal_errors(true);

        try {
            $dom = new DOMDocument();

            // loadHTML() assumes ISO-8859-1 unless told otherwise, so non-ASCII
            // characters are turned into numeric entities first. This replaces
            // mb_convert_encoding(..., 'HTML-ENTITIES'), deprecated in PHP 8.2.
            $encoded_html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
            $dom->loadHTML($encoded_html);
            libxml_clear_errors();

            $xpath = new DOMXPath($dom);

            // Plain URL attributes.
            $attribute_queries = [
                'src'    => '//img[@src] | //video[@src] | //audio[@src] | //video//source[@src] | //audio//source[@src]',
                'poster' => '//video[@poster]',
            ];
            foreach ($attribute_queries as $attribute => $query) {
                $nodes = $xpath->query($query);
                if (!$nodes) {
                    continue;
                }
                foreach ($nodes as $node) {
                    $collect($node->getAttribute($attribute));
                }
            }

            // srcset: comma-separated "url descriptor" pairs; the URL is the
            // first whitespace-delimited token of each pair.
            $srcset_nodes = $xpath->query('//img[@srcset] | //source[@srcset]');
            if ($srcset_nodes) {
                foreach ($srcset_nodes as $node) {
                    foreach (explode(',', $node->getAttribute('srcset')) as $source) {
                        $parts = preg_split('/\s+/', trim($source));
                        if (!empty($parts[0])) {
                            $collect($parts[0]);
                        }
                    }
                }
            }

            // Inline styles: pick up every url(...), wherever it appears in the
            // declaration (e.g. "background: #fff url(a.jpg)" or multiple layers).
            $style_nodes = $xpath->query('//*[@style]');
            if ($style_nodes) {
                foreach ($style_nodes as $node) {
                    $style = $node->getAttribute('style');
                    if (preg_match_all('/url\(\s*[\'"]?([^\'")]+?)[\'"]?\s*\)/i', $style, $matches)) {
                        foreach ($matches[1] as $style_url) {
                            $collect($style_url);
                        }
                    }
                }
            }

            // Unset DOM to free memory
            unset($dom, $xpath);
        } catch (Throwable $e) {
            // Best-effort: keep whatever was collected before the failure.
        }

        libxml_clear_errors();
        libxml_use_internal_errors($previous_libxml_mode);

        return array_values(array_unique($media_urls));
    }

    /**
     * Normalize a URL found in page markup to an absolute URL without query
     * string or fragment.
     *
     * Resolution rules:
     *  - "http://" / "https://" URLs are kept as they are.
     *  - Other schemes (data:, blob:, mailto:, ...) are returned untouched; they
     *    cannot point at an uploaded file.
     *  - Protocol-relative URLs ("//host/path") get the scheme of home_url().
     *  - Root-relative paths ("/path") are resolved against the scheme, host and
     *    port of home_url(), i.e. the host root rather than the WordPress
     *    directory, which matters for sub-directory installs.
     *  - Anything else is passed to site_url(). The page's own URL is not known
     *    here, so this is an approximation for document-relative paths.
     *
     * Finally backslashes are turned into slashes and repeated slashes after
     * the "scheme://" prefix are collapsed. This is done by hand rather than
     * with wp_normalize_path(), which is meant for filesystem paths.
     *
     * @param string $url Raw URL as found in the HTML.
     * @return string Normalized URL, or '' when nothing is left after stripping.
     */
    private function normalize_url($url) {
        $url = trim((string) $url);

        // Cut at the first '?' or '#'. Unlike strtok(), this handles empty
        // input and strings that start with a delimiter.
        $url = (string) substr($url, 0, strcspn($url, '?#'));
        if ($url === '') {
            return '';
        }

        if (!preg_match('#^https?://#i', $url)) {
            if (preg_match('#^[a-z][a-z0-9+.\-]*:#i', $url)) {
                return $url;
            }

            $home = wp_parse_url(home_url('/'));
            if (!is_array($home)) {
                $home = [];
            }
            $scheme = !empty($home['scheme']) ? $home['scheme'] : 'http';

            if (strpos($url, '//') === 0) {
                $url = $scheme . ':' . $url;
            } elseif ($url[0] === '/' && !empty($home['host'])) {
                $origin = $scheme . '://' . $home['host'];
                if (!empty($home['port'])) {
                    $origin .= ':' . $home['port'];
                }
                $url = $origin . $url;
            } else {
                $url = site_url($url);
            }
        }

        if (preg_match('#^([a-z][a-z0-9+.\-]*://)(.*)$#is', $url, $parts)) {
            $rest = str_replace('\\', '/', $parts[2]);
            $rest = preg_replace('#/{2,}#', '/', $rest);
            return $parts[1] . $rest;
        }

        return $url;
    }

    /**
     * Resolve a media URL to an attachment ID or, failing that, to a file in
     * the uploads directory.
     *
     * Lookup order:
     *  1. attachment_url_to_postid() on the URL as given.
     *  2. If the file name carries an intermediate-size suffix ("-300x200"),
     *     the same lookup on the URL without the suffix, then with "-scaled"
     *     and "-rotated" appended to the base name. Front-end markup mostly
     *     references resized images, which step 1 alone does not match.
     *  3. A file-existence check for URLs located under the uploads base URL.
     *
     * @param string $url Absolute media URL.
     * @return array|null ['type' => 'attachment', 'id' => int],
     *                    ['type' => 'file', 'path' => string], or null when the
     *                    URL matches nothing.
     */
    public function resolve_url_to_attachment($url) {
        $url = (string) $url;
        if ($url === '') {
            return null;
        }

        // First try to get attachment ID
        $attachment_id = attachment_url_to_postid($url);

        if (!$attachment_id) {
            $unsized = preg_replace('/-\d+x\d+(?=\.[a-z0-9]+$)/i', '', $url);

            // Only URLs that actually had a size suffix get the extra lookups,
            // which keeps the number of queries for unrelated URLs unchanged.
            if (is_string($unsized) && $unsized !== $url) {
                $candidates = [$unsized];
                foreach (['-scaled', '-rotated'] as $suffix) {
                    $candidates[] = preg_replace('/(\.[a-z0-9]+)$/i', $suffix . '$1', $unsized);
                }

                foreach ($candidates as $candidate) {
                    if (!is_string($candidate) || $candidate === '') {
                        continue;
                    }
                    $attachment_id = attachment_url_to_postid($candidate);
                    if ($attachment_id) {
                        break;
                    }
                }
            }
        }

        if ($attachment_id) {
            return ['type' => 'attachment', 'id' => $attachment_id];
        }

        // Check if it's in uploads directory. The trailing slash prevents a
        // sibling directory such as ".../uploads-old/" from matching.
        $upload_dir = wp_get_upload_dir();
        $base_url = rtrim($upload_dir['baseurl'], '/') . '/';

        if (strpos($url, $base_url) === 0) {
            $relative = substr($url, strlen($base_url));

            // The URL originates from fetched markup; refuse to step outside
            // the uploads directory.
            if ($relative !== '' && $relative !== false && strpos($relative, '..') === false) {
                $file_path = rtrim($upload_dir['basedir'], '/\\') . '/' . $relative;

                if (file_exists($file_path)) {
                    return ['type' => 'file', 'path' => $file_path];
                }
            }
        }

        return null;
    }

    /**
     * Run a complete Live Scan for a media scan in a single call.
     *
     * Crawls every URL returned by get_public_urls(), counts for each scan item
     * the number of pages on which its attachment appears, writes live_used /
     * live_url_count for the items that were found and keeps the progress
     * transient up to date while doing so.
     *
     * Behavioral notes:
     *  - A failure while handling one URL is logged (when ORPHANIX_Logger is
     *    available) and the scan moves on to the next URL.
     *  - An attachment referenced several times on the same page counts once
     *    for that page.
     *  - Memory is relieved every 5 URLs by flushing only the in-memory object
     *    cache when the cache backend supports it; otherwise the full flush is
     *    used as a fallback.
     *
     * @param int $media_scan_id ID of the media scan being checked.
     * @return array ['total_urls' => int, 'media_found' => int] where media_found
     *               is the number of distinct scan items seen on at least one page.
     */
    public function run_live_scan($media_scan_id) {
        global $wpdb;

        // Get all URLs to scan
        $urls = array_values($this->get_public_urls());
        $total_urls = count($urls);
        $progress_key = 'orphanix_live_scan_progress_' . $media_scan_id;

        // Prefer a runtime-only flush so a persistent object cache shared with
        // the rest of the site is not wiped during the scan.
        $flush_cache = static function () {
            if (function_exists('wp_cache_supports') && wp_cache_supports('flush_runtime')) {
                wp_cache_flush_runtime();
                return;
            }
            wp_cache_flush();
        };

        // Scan item ID => number of pages the item was found on
        $media_found = [];
        $batch_count = 0;

        foreach ($urls as $index => $url) {
            try {
                $page_item_ids = [];      // Scan items seen on this page.
                $seen_attachments = [];   // Attachments already looked up for this page.

                // Extract media from this page
                $page_media = $this->fetch_and_parse_url($url);

                if (is_array($page_media)) {
                    foreach ($page_media as $media_url) {
                        $resolved = $this->resolve_url_to_attachment($media_url);
                        if (!$resolved || $resolved['type'] !== 'attachment') {
                            continue;
                        }

                        $attachment_id = intval($resolved['id']);
                        if (isset($seen_attachments[$attachment_id])) {
                            continue;
                        }
                        $seen_attachments[$attachment_id] = true;

                        // Find scan items for this attachment
                        // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
                        $items = $wpdb->get_results($wpdb->prepare(
                            "SELECT id FROM {$wpdb->prefix}orphanix_scan_items 
                             WHERE scan_id = %d AND attachment_id = %d",
                            $media_scan_id,
                            $attachment_id
                        ));

                        foreach ((array) $items as $item) {
                            $page_item_ids[$item->id] = true;
                        }
                    }
                }

                foreach (array_keys($page_item_ids) as $item_id) {
                    $media_found[$item_id] = isset($media_found[$item_id]) ? $media_found[$item_id] + 1 : 1;
                }
            } catch (Throwable $e) {
                // Log error but continue with the next URL
                if ( class_exists( 'ORPHANIX_Logger' ) ) {
                    ORPHANIX_Logger::log( 'Live Scan Error', [
                        'url_index' => $index + 1,
                        'message' => $e->getMessage(),
                    ] );
                }
            }

            // Update progress transient ($total_urls is at least 1 inside the loop)
            $percentage = round(($index + 1) / $total_urls * 100);
            set_transient($progress_key, [
                'processed' => $index + 1,
                'total'     => $total_urls,
                'percentage' => $percentage,
                'current_url' => substr($url, 0, 100), // Limit URL length
                'status'    => 'running',
            ], 3600);

            $batch_count++;

            // Free up memory and reset cache every 5 URLs
            if ($batch_count >= 5) {
                $flush_cache();
                gc_collect_cycles();
                $batch_count = 0;
            }
        }

        // Update scan items with live usage data in batches
        $batch_size = 100;
        $items_to_update = array_chunk(array_keys($media_found), $batch_size);

        foreach ($items_to_update as $batch) {
            foreach ($batch as $item_id) {
                // phpcs:ignore WordPress.DB.DirectDatabaseQuery.DirectQuery, WordPress.DB.DirectDatabaseQuery.NoCaching
                $wpdb->update(
                    "{$wpdb->prefix}orphanix_scan_items",
                    [
                        'live_used' => 1,
                        'live_url_count' => intval($media_found[$item_id]),
                    ],
                    ['id' => intval($item_id)],
                    ['%d', '%d'],
                    ['%d']
                );
            }
            $flush_cache();
        }

        // Mark live scan as complete
        set_transient($progress_key, [
            'processed' => $total_urls,
            'total'     => $total_urls,
            'percentage' => 100,
            'current_url' => 'Complete',
            'status'    => 'completed',
        ], 3600);

        return [
            'total_urls' => $total_urls,
            'media_found' => count($media_found),
        ];
    }

    /**
     * Read the stored progress record of a Live Scan.
     *
     * @param int $scan_id ID of the media scan.
     * @return mixed The value of the progress transient as returned by
     *               get_transient() for this scan.
     */
    public function get_live_scan_progress($scan_id) {
        $transient_name = 'orphanix_live_scan_progress_' . $scan_id;

        return get_transient($transient_name);
    }
}
