<?php
/**
 * Scanner for Cached HTML Files
 *
 * Scans Static Cache Wrangler output directory and catalogs
 * all HTML files available for conversion.
 *
 * @package STCWHeadlessAssistant
 * @since 2.0.0
 */

namespace STCW\Headless;

// Stop executing this file when the ABSPATH constant is not defined
// (presumably: the file was requested directly instead of being loaded by WordPress).
if (!defined('ABSPATH')) exit;

class Scanner {

    /**
     * Get the path patterns that are excluded from scans.
     *
     * The defaults below can be replaced or extended through the
     * 'stcw_headless_excluded_paths' filter.
     *
     * @return array Excluded path patterns
     */
    private function get_excluded_paths() {
        $excluded = [
            'assets',           // Asset files
            'index.php',        // WordPress quirk - should never be a directory
            'author',           // Author archives (usually not needed)
            'tag',              // Tag archives
            'category',         // Category archives
            'feed',             // RSS feeds
            'wp-json',          // REST API endpoints
            'sitemap',          // XML sitemaps
            '404',              // Error pages
        ];

        /**
         * Filter excluded paths
         *
         * @param array $excluded Array of path patterns to exclude
         */
        return apply_filters('stcw_headless_excluded_paths', $excluded);
    }

    /**
     * Check whether a path should be excluded from the scan.
     *
     * A path is excluded when, for any pattern:
     *  - the path starts with the pattern (plain textual prefix, so the
     *    pattern 'feed' also matches 'feedback'), or
     *  - the pattern is a complete directory segment anywhere in the path,
     *    including the last one (e.g. 'my-post/feed').
     *
     * @param string $relative_path Relative path from static dir, without
     *                              leading or trailing slashes
     * @return bool True if should be excluded
     */
    private function should_exclude_path($relative_path) {
        $relative_path = (string) $relative_path;

        // Wrapping the path in slashes lets a single substring test match
        // a segment at the start, in the middle, or at the very end.
        $wrapped_path = '/' . $relative_path . '/';

        // The list comes from a filter, so do not trust its shape.
        foreach ((array) $this->get_excluded_paths() as $pattern) {
            if (!is_string($pattern) && !is_int($pattern)) {
                continue;
            }

            $pattern = (string) $pattern;

            // An empty needle matches at offset 0 on PHP 8 and would
            // silently exclude every file.
            if ($pattern === '') {
                continue;
            }

            // Path starts with the excluded pattern
            if (strpos($relative_path, $pattern) === 0) {
                return true;
            }

            // Pattern is a whole directory segment of the path
            if (strpos($wrapped_path, '/' . $pattern . '/') !== false) {
                return true;
            }
        }

        return false;
    }

    /**
     * Build the response returned when a scan cannot be completed.
     *
     * @param string $message Human readable failure reason
     * @return array Response with the same keys as a successful scan
     */
    private function scan_failure($message) {
        return [
            'success' => false,
            'message' => $message,
            'files' => [],
            'count' => 0,
        ];
    }

    /**
     * Scan all cached HTML files in SCW static directory
     *
     * Walks the directory returned by Core::get_scw_static_dir() recursively
     * and records every file named 'index.html' whose directory is not
     * excluded. The result is sorted by URL path.
     *
     * @return array {
     *     @type bool   $success Whether the scan completed
     *     @type string $message Summary or error description
     *     @type array  $files   One entry per file with the keys 'path',
     *                           'relative_path', 'url_path', 'size',
     *                           'modified' and 'modified_formatted'
     *     @type int    $count   Number of entries in $files
     * }
     */
    public function scan_cached_files() {
        $static_dir = Core::get_scw_static_dir();

        if (empty($static_dir) || !is_dir($static_dir)) {
            return $this->scan_failure('Static Cache Wrangler directory not found.');
        }

        $files = [];

        try {
            // Default LEAVES_ONLY mode: directories themselves are never needed.
            $iterator = new \RecursiveIteratorIterator(
                new \RecursiveDirectoryIterator($static_dir, \FilesystemIterator::SKIP_DOTS)
            );

            foreach ($iterator as $file) {
                // Only process index.html files (skip assets)
                if (!$file->isFile() || $file->getFilename() !== 'index.html') {
                    continue;
                }

                // getSubPath() is forwarded to the active RecursiveDirectoryIterator
                // and yields the directory of the current entry relative to the
                // scan root, so no string surgery on absolute paths is required.
                $relative_path = (string) $iterator->getSubPath();

                // Sub paths use the platform separator; URLs and patterns use '/'.
                if (DIRECTORY_SEPARATOR !== '/') {
                    $relative_path = str_replace(DIRECTORY_SEPARATOR, '/', $relative_path);
                }

                $relative_path = trim($relative_path, '/');

                // Skip excluded paths
                if ($this->should_exclude_path($relative_path)) {
                    continue;
                }

                // The root index.html maps to '/', everything else to '/dir/sub/'
                $url_path = ($relative_path === '')
                    ? '/'
                    : '/' . trailingslashit($relative_path);

                $modified = $file->getMTime();

                $files[] = [
                    'path' => $file->getPathname(),
                    'relative_path' => $relative_path,
                    'url_path' => $url_path,
                    'size' => $file->getSize(),
                    'modified' => $modified,
                    'modified_formatted' => gmdate('Y-m-d H:i:s', $modified),
                ];
            }
        } catch (\Exception $e) {
            return $this->scan_failure(sprintf('Error scanning files: %s', $e->getMessage()));
        }

        // Sort by URL path
        usort($files, static function ($a, $b) {
            return strcmp($a['url_path'], $b['url_path']);
        });

        $count = count($files);

        return [
            'success' => true,
            'message' => sprintf('Found %d cached HTML files', $count),
            'files' => $files,
            'count' => $count,
        ];
    }

    /**
     * Get quick scan statistics
     *
     * Runs a full scan and aggregates it. When the scan fails, zeroed
     * statistics are returned instead of an error.
     *
     * @return array Statistics with the keys 'total_files', 'total_size',
     *               'formatted_size', 'oldest_file' and 'newest_file'
     *               (timestamps formatted with gmdate(), or null when no
     *               file was found)
     */
    public function get_scan_stats() {
        $scan = $this->scan_cached_files();

        if (!$scan['success']) {
            return [
                'total_files' => 0,
                'total_size' => 0,
                'formatted_size' => '0 B',
                'oldest_file' => null,
                'newest_file' => null,
            ];
        }

        $total_size = 0;
        $oldest = null;
        $newest = null;

        foreach ($scan['files'] as $file) {
            $total_size += $file['size'];

            if ($oldest === null || $file['modified'] < $oldest) {
                $oldest = $file['modified'];
            }

            if ($newest === null || $file['modified'] > $newest) {
                $newest = $file['modified'];
            }
        }

        return [
            'total_files' => count($scan['files']),
            'total_size' => $total_size,
            'formatted_size' => size_format($total_size),
            // Compare against null explicitly: a timestamp of 0 is a valid value.
            'oldest_file' => $oldest !== null ? gmdate('Y-m-d H:i:s', $oldest) : null,
            'newest_file' => $newest !== null ? gmdate('Y-m-d H:i:s', $newest) : null,
        ];
    }
}
