<?php
/**
 * RankJet AI Redirect AI Suggestions
 * 
 * AI-powered URL similarity matching using Levenshtein distance,
 * Jaro-Winkler algorithm, and n-gram tokenization for suggesting
 * redirects for 404 URLs.
 * 
 * @package RankJet_AI
 * @since 1.1.0
 */

// Stop immediately when the ABSPATH constant is missing, i.e. the file was
// requested directly instead of being included by the host application.
defined('ABSPATH') || exit;

/**
 * Suggests redirect targets for broken (404) URLs.
 *
 * The last path segment of the broken URL is normalised into a "slug" and
 * compared against the slugs of published posts and public taxonomy terms.
 * The comparison blends three byte-wise string metrics (normalised
 * Levenshtein, Jaro-Winkler and bigram overlap) into one score in [0, 1].
 */
class Rankjet_Ai_Redirect_AI {

    /**
     * Minimum combined similarity score a candidate needs to be suggested.
     */
    const MIN_THRESHOLD = 0.5;

    /**
     * Maximum number of suggestions returned per broken URL.
     */
    const MAX_SUGGESTIONS = 5;

    /**
     * Per-instance cache of candidate URLs (null until first built).
     */
    private $candidates_cache = null;

    /**
     * Suggest redirect destinations for a broken URL.
     *
     * @param string $broken_url The 404 URL.
     * @return array List of suggestions (keys: url, title, score, type),
     *               best score first, at most MAX_SUGGESTIONS entries.
     */
    public function suggest_redirect($broken_url) {
        $slug = $this->extract_slug($broken_url);

        // Compare against '' explicitly: empty() would also reject the valid slug "0".
        if ($slug === '') {
            return [];
        }

        $suggestions = [];

        foreach ($this->get_candidate_urls() as $candidate) {
            $score = $this->calculate_similarity($slug, $candidate['slug']);

            if ($score >= self::MIN_THRESHOLD) {
                $suggestions[] = [
                    'url' => $candidate['url'],
                    'title' => $candidate['title'],
                    'score' => round($score, 4),
                    'type' => $candidate['type'],
                ];
            }
        }

        // Highest score first.
        usort($suggestions, static function ($a, $b) {
            return $b['score'] <=> $a['score'];
        });

        return array_slice($suggestions, 0, self::MAX_SUGGESTIONS);
    }

    /**
     * Extract the normalised slug (last path segment) from a URL.
     *
     * @param string $url Absolute or relative URL.
     * @return string Lower-cased slug with hyphens/underscores turned into
     *                spaces, or '' when the URL has no usable path segment.
     */
    private function extract_slug($url) {
        $url = trim((string) $url);

        // Drop query string and fragment. strcspn()/substr() is used instead of
        // strtok() because strtok() skips leading delimiters (so "?p=1" would
        // yield "p=1" as a path) and mutates global tokenizer state.
        $url = (string) substr($url, 0, strcspn($url, '?#'));

        if ($url === '') {
            return '';
        }

        $path = wp_parse_url($url, PHP_URL_PATH);

        if ($path === false) {
            // Unparseable input: fall back to treating the raw string as a path.
            $path = $url;
        } elseif (!is_string($path)) {
            // The URL parsed but carries no path (e.g. "https://example.com"),
            // so there is no slug; do not mistake the host name for one.
            return '';
        }

        // Remove leading/trailing slashes, then keep only the last segment.
        $segments = explode('/', trim($path, '/'));
        $slug = end($segments);

        // Remove common file extensions.
        $slug = preg_replace('/\.(html?|php|asp|aspx)$/i', '', $slug);

        return $this->normalize_slug($slug);
    }

    /**
     * Normalise a raw slug for comparison: separators become spaces, text is lower-cased.
     *
     * @param string $slug Raw slug.
     * @return string Normalised slug.
     */
    private function normalize_slug($slug) {
        return strtolower(str_replace(['-', '_'], ' ', (string) $slug));
    }

    /**
     * Collect candidate redirect targets: published posts of public post types
     * (the 500 most recently modified) and non-empty terms of public
     * taxonomies (up to 200 per taxonomy). The result is cached on the instance.
     *
     * @return array List of candidates (keys: url, slug, title, type).
     */
    private function get_candidate_urls() {
        if ($this->candidates_cache !== null) {
            return $this->candidates_cache;
        }

        $candidates = [];

        // Published posts and pages of every public post type.
        $post_types = get_post_types(['public' => true], 'names');

        $posts = get_posts([
            'post_type' => $post_types,
            'post_status' => 'publish',
            'posts_per_page' => 500, // Limit for performance
            'orderby' => 'post_modified',
            'order' => 'DESC',
            'no_found_rows' => true,
        ]);

        foreach ($posts as $post) {
            $url = get_permalink($post->ID);

            // get_permalink() returns false when the post cannot be resolved.
            if (!is_string($url) || $url === '') {
                continue;
            }

            $slug = $this->extract_slug($url);

            if ($slug !== '') {
                $candidates[] = [
                    'url' => $url,
                    'slug' => $slug,
                    'title' => $post->post_title,
                    'type' => $post->post_type,
                ];
            }
        }

        // Terms of every public taxonomy.
        $taxonomies = get_taxonomies(['public' => true], 'names');

        foreach ($taxonomies as $taxonomy) {
            $terms = get_terms([
                'taxonomy' => $taxonomy,
                'hide_empty' => true,
                'number' => 200, // Limit per taxonomy
            ]);

            if (is_wp_error($terms)) {
                continue;
            }

            foreach ($terms as $term) {
                $url = get_term_link($term);
                if (is_wp_error($url)) {
                    continue;
                }

                $slug = $this->normalize_slug($term->slug);
                if ($slug === '') {
                    continue;
                }

                $candidates[] = [
                    'url' => $url,
                    'slug' => $slug,
                    'title' => $term->name,
                    'type' => $taxonomy,
                ];
            }
        }

        $this->candidates_cache = $candidates;
        return $candidates;
    }

    /**
     * Calculate similarity between two strings using multiple algorithms.
     *
     * @param string $str1 First string.
     * @param string $str2 Second string.
     * @return float|int Score in [0, 1]; 0 when either string is empty,
     *                   1.0 when the strings are identical.
     */
    public function calculate_similarity($str1, $str2) {
        $str1 = (string) $str1;
        $str2 = (string) $str2;

        // Explicit '' comparison: empty() would wrongly treat "0" as missing.
        if ($str1 === '' || $str2 === '') {
            return 0;
        }

        // Exact match
        if ($str1 === $str2) {
            return 1.0;
        }

        $levenshtein = $this->normalized_levenshtein($str1, $str2);
        $jaro_winkler = $this->jaro_winkler($str1, $str2);
        $ngram = $this->ngram_similarity($str1, $str2, 2);

        // Weighted combination (weights sum to 1, each component is in [0, 1]):
        // Jaro-Winkler is best for short strings/typos,
        // N-gram is good for partial matches,
        // Levenshtein is good for transformations.
        return ($levenshtein * 0.3) + ($jaro_winkler * 0.4) + ($ngram * 0.3);
    }

    /**
     * Levenshtein distance normalised to a similarity in [0, 1].
     *
     * @param string $str1 First string.
     * @param string $str2 Second string.
     * @return float 1.0 for identical strings, approaching 0 as they diverge.
     */
    private function normalized_levenshtein($str1, $str2) {
        $max_len = max(strlen($str1), strlen($str2));

        if ($max_len === 0) {
            return 1.0;
        }

        $distance = levenshtein($str1, $str2);

        // Before PHP 8.0 levenshtein() returns -1 for inputs longer than 255
        // bytes; without this guard the score would exceed 1.
        if ($distance < 0) {
            return 0.0;
        }

        return 1 - ($distance / $max_len);
    }

    /**
     * Jaro-Winkler similarity: Jaro score boosted for a shared prefix.
     * Better for short strings and detecting typos.
     *
     * @param string $str1         First string.
     * @param string $str2         Second string.
     * @param float  $prefix_scale Weight given to each matching prefix character.
     * @return float Similarity score.
     */
    private function jaro_winkler($str1, $str2, $prefix_scale = 0.1) {
        $jaro = $this->jaro($str1, $str2);

        // Length of the common prefix, capped at 4 characters.
        $prefix_length = 0;
        $max_prefix = min(4, strlen($str1), strlen($str2));

        while ($prefix_length < $max_prefix && $str1[$prefix_length] === $str2[$prefix_length]) {
            $prefix_length++;
        }

        return $jaro + ($prefix_length * $prefix_scale * (1 - $jaro));
    }

    /**
     * Jaro similarity (byte-wise).
     *
     * @param string $str1 First string.
     * @param string $str2 Second string.
     * @return float Similarity in [0, 1].
     */
    private function jaro($str1, $str2) {
        $len1 = strlen($str1);
        $len2 = strlen($str2);

        if ($len1 === 0 && $len2 === 0) {
            return 1.0;
        }

        if ($len1 === 0 || $len2 === 0) {
            return 0.0;
        }

        // Characters only count as matching when their positions differ by at
        // most this many places.
        $match_distance = max(0, intdiv(max($len1, $len2), 2) - 1);

        $str1_matches = array_fill(0, $len1, false);
        $str2_matches = array_fill(0, $len2, false);

        $matches = 0;

        // Greedily pair each character of $str1 with the first unused equal
        // character of $str2 inside the match window.
        for ($i = 0; $i < $len1; $i++) {
            $start = max(0, $i - $match_distance);
            $end = min($i + $match_distance + 1, $len2);

            for ($j = $start; $j < $end; $j++) {
                if ($str2_matches[$j] || $str1[$i] !== $str2[$j]) {
                    continue;
                }
                $str1_matches[$i] = true;
                $str2_matches[$j] = true;
                $matches++;
                break;
            }
        }

        if ($matches === 0) {
            return 0.0;
        }

        // Walk the matched characters of both strings in order; every pair
        // that differs contributes half a transposition.
        $mismatched = 0;
        $k = 0;
        for ($i = 0; $i < $len1; $i++) {
            if (!$str1_matches[$i]) {
                continue;
            }
            // Both strings hold exactly $matches flagged positions, so $k
            // always finds the next one before running off the end.
            while (!$str2_matches[$k]) {
                $k++;
            }
            if ($str1[$i] !== $str2[$k]) {
                $mismatched++;
            }
            $k++;
        }

        $transpositions = $mismatched / 2;

        return (
            ($matches / $len1) +
            ($matches / $len2) +
            (($matches - $transpositions) / $matches)
        ) / 3;
    }

    /**
     * N-gram similarity as a multiset Jaccard index.
     * Good for partial matches and word-level similarity.
     *
     * Each distinct n-gram contributes min(count1, count2) to the intersection
     * and max(count1, count2) to the union, which keeps the score symmetric
     * and within [0, 1] even when an n-gram repeats inside a string.
     *
     * @param string $str1 First string.
     * @param string $str2 Second string.
     * @param int    $n    N-gram length.
     * @return float|int Similarity in [0, 1].
     */
    private function ngram_similarity($str1, $str2, $n = 2) {
        $ngrams1 = $this->get_ngrams($str1, $n);
        $ngrams2 = $this->get_ngrams($str2, $n);

        if (empty($ngrams1) || empty($ngrams2)) {
            return 0;
        }

        $counts1 = array_count_values($ngrams1);
        $counts2 = array_count_values($ngrams2);

        $intersection = 0;
        $union = 0;

        foreach ($counts1 as $gram => $count) {
            $other = isset($counts2[$gram]) ? $counts2[$gram] : 0;
            $intersection += min($count, $other);
            $union += max($count, $other);
        }

        // N-grams that occur only in the second string.
        foreach ($counts2 as $gram => $count) {
            if (!isset($counts1[$gram])) {
                $union += $count;
            }
        }

        if ($union === 0) {
            return 0;
        }

        return $intersection / $union;
    }

    /**
     * Split a string into overlapping n-grams (byte-wise).
     *
     * @param string $str Input string; surrounding whitespace is ignored.
     * @param int    $n   N-gram length.
     * @return array List of n-grams; the whole string when it is shorter
     *               than $n, and an empty list for a blank string.
     */
    private function get_ngrams($str, $n) {
        $str = trim($str);
        $len = strlen($str);

        if ($len === 0) {
            return [];
        }

        if ($len < $n) {
            return [$str];
        }

        $ngrams = [];
        for ($i = 0; $i <= $len - $n; $i++) {
            $ngrams[] = substr($str, $i, $n);
        }

        return $ngrams;
    }

    /**
     * Batch process AI suggestions for multiple 404s.
     *
     * @param array $urls       List of rows, each with a 'url' key and an
     *                          optional 'id' key (row id in the 404 log table).
     * @param bool  $save_to_db Whether to store the best suggestion and its
     *                          score on the matching log row.
     * @return array Map of URL => list of suggestions. Rows without a usable
     *               'url' are skipped.
     */
    public function batch_suggest($urls, $save_to_db = true) {
        global $wpdb;
        $table = $wpdb->prefix . 'rankjet_404_logs';

        $results = [];

        if (!is_array($urls) && !($urls instanceof Traversable)) {
            return $results;
        }

        foreach ($urls as $url_data) {
            // Skip malformed rows instead of raising warnings on missing keys.
            $is_row = is_array($url_data) || $url_data instanceof ArrayAccess;
            if (!$is_row || !isset($url_data['url'])) {
                continue;
            }

            $url = (string) $url_data['url'];
            if ($url === '') {
                continue;
            }

            $id = isset($url_data['id']) ? (int) $url_data['id'] : null;

            $suggestions = $this->suggest_redirect($url);

            if (!empty($suggestions) && $save_to_db && $id) {
                $best = $suggestions[0];
                $wpdb->update(
                    $table,
                    [
                        'ai_suggestion' => $best['url'],
                        'ai_confidence' => $best['score'],
                    ],
                    ['id' => $id],
                    ['%s', '%f'],
                    ['%d']
                );
            }

            $results[$url] = $suggestions;
        }

        return $results;
    }
}
