<?php

namespace Limb_Chatbot\Includes\Services;

use Limb_Chatbot\Includes\Data_Objects\Dataset;
use Limb_Chatbot\Includes\Data_Objects\Segment;

/**
 * Class Semantic_Segmenter
 *
 * Splits content into semantic segments and groups them into contextual clusters.
 * Optimized for performance with caching and efficient processing.
 *
 * @package Limb_Chatbot\Includes\Services
 * @since 1.0.0
 */
class Semantic_Segmenter {

	/**
	 * Minimum length (in bytes, as measured by strlen) a piece of text must have
	 * to become a segment. Shorter fragments are discarded as noise.
	 */
	private const MIN_SEGMENT_LENGTH = 10;

	/**
	 * Per-instance cache of already computed clusters, keyed by content hash,
	 * dataset id and token limit. Grows until clear_cache() is called.
	 *
	 * @var array
	 */
	private array $content_cache = [];

	/**
	 * Segment and group content into contextual clusters.
	 *
	 * The content is treated as HTML when stripping tags changes it, otherwise
	 * as plain text. Results are memoized on this instance; a cache hit returns
	 * the very same Segment objects that were produced by the first call.
	 *
	 * @param  string  $raw_html_or_text  The raw HTML or plain text content.
	 * @param  Dataset  $dataset  Dataset providing source metadata for every segment.
	 * @param  int  $max_tokens_per_cluster  Max tokens per cluster (default: 2500).
	 *
	 * @return array Array of clusters, each containing Segment objects.
	 */
	public function segment(
		string $raw_html_or_text,
		Dataset $dataset,
		int $max_tokens_per_cluster = 2500
	): array {
		// Hash the content on its own and delimit the remaining parts so that
		// e.g. (id 1, limit 2500) and (id 12, limit 500) cannot share a key.
		$cache_key = md5( $raw_html_or_text ) . ':' . $dataset->get_id() . ':' . $max_tokens_per_cluster;

		if ( isset( $this->content_cache[ $cache_key ] ) ) {
			return $this->content_cache[ $cache_key ];
		}

		$segments = $this->is_html( $raw_html_or_text )
			? $this->segment_html( $raw_html_or_text, $dataset )
			: $this->segment_text( $raw_html_or_text, $dataset );

		$clusters = $this->group_into_clusters( $segments, $max_tokens_per_cluster );

		$this->content_cache[ $cache_key ] = $clusters;

		return $clusters;
	}

	/**
	 * Check if the content is HTML.
	 *
	 * Content counts as HTML when strip_tags() alters it.
	 *
	 * @param  string  $content
	 *
	 * @return bool
	 */
	private function is_html( string $content ): bool {
		return strip_tags( $content ) !== $content;
	}

	/**
	 * Segment HTML content by splitting it at heading elements.
	 *
	 * Headings themselves do not become segments; they only set the heading
	 * context attached to the content sections that follow them. h1/h2 set the
	 * top-level context (and reset the h3 context), h3 sets the sub-context,
	 * h4-h6 are split points only.
	 *
	 * @param  string  $html
	 * @param  Dataset  $dataset
	 *
	 * @return Segment[]
	 */
	private function segment_html( string $html, Dataset $dataset ): array {
		$segments = [];
		$content  = $this->clean_html_content( $html );

		$current_h2 = null;
		$current_h3 = null;
		$order      = 0;

		// Split on headings while keeping the heading markup as its own chunk.
		$sections = preg_split( '/(<h[1-6][^>]*>.*?<\/h[1-6]>)/is', $content, -1, PREG_SPLIT_DELIM_CAPTURE );
		if ( false === $sections ) {
			// PCRE failure (e.g. backtrack limit): treat the document as one section
			// instead of iterating over `false`.
			$sections = [ $content ];
		}

		foreach ( $sections as $section ) {
			$section = trim( $section );
			if ( '' === $section ) {
				continue;
			}

			if ( preg_match( '/<h([1-6])[^>]*>(.*?)<\/h[1-6]>/is', $section, $heading_match ) ) {
				$level        = (int) $heading_match[1];
				$heading_text = $this->extract_clean_text( $heading_match[2] );

				if ( $level <= 2 ) {
					$current_h2 = $heading_text;
					$current_h3 = null; // A new top-level heading invalidates the sub-heading.
				} elseif ( 3 === $level ) {
					$current_h3 = $heading_text;
				}

				continue;
			}

			// Regular content: extract_clean_text() already normalizes whitespace.
			$clean_text = $this->extract_clean_text( $section );
			if ( strlen( $clean_text ) < self::MIN_SEGMENT_LENGTH ) {
				continue;
			}

			// Prefer the most specific heading. The previous `$current_h2 ?? $current_h3`
			// ignored every h3 as soon as any h1/h2 had been seen.
			$segments[] = $this->build_segment( $dataset, $current_h3 ?? $current_h2, $clean_text, $order++ );
		}

		return $segments;
	}

	/**
	 * Clean HTML content by removing scripts, styles, noscript blocks, comments
	 * and empty div/span/p elements.
	 *
	 * @param  string  $html
	 *
	 * @return string
	 */
	private function clean_html_content( string $html ): string {
		$patterns = [
			'/<script[^>]*>.*?<\/script>/is',
			'/<style[^>]*>.*?<\/style>/is',
			'/<noscript[^>]*>.*?<\/noscript>/is',
			'/<!--.*?-->/s',
			'/<(div|span|p)[^>]*>\s*<\/\1>/is',
		];

		foreach ( $patterns as $pattern ) {
			$html = $this->replace_or_keep( $pattern, '', $html );
		}

		return $html;
	}

	/**
	 * Extract clean text from an HTML fragment.
	 *
	 * Links, bold, italic, code and list items are first rewritten to
	 * Markdown-like markers, then all remaining tags are stripped, entities are
	 * decoded and whitespace is collapsed. Note that the final whitespace
	 * normalization also collapses the newlines inserted for br/pre/li.
	 *
	 * @param  string  $html
	 *
	 * @return string
	 */
	private function extract_clean_text( string $html ): string {
		// Links become [text](url).
		$with_links = preg_replace_callback(
			'/<a\s+[^>]*href=["\']([^"\']+)["\'][^>]*>(.*?)<\/a>/is',
			static function ( array $matches ): string {
				$url  = trim( $matches[1] );
				$text = strip_tags( $matches[2] );

				return '[' . $text . '](' . $url . ')';
			},
			$html
		);
		if ( is_string( $with_links ) ) {
			$html = $with_links; // Keep the original markup if PCRE failed.
		}

		// Inline formatting, code blocks, line breaks and lists, applied in this order.
		$replacements = [
			'/<\s*(strong|b)[^>]*>(.*?)<\s*\/\1\s*>/is' => '**$2**',
			'/<\s*(em|i)[^>]*>(.*?)<\s*\/\1\s*>/is'     => '_$2_',
			'/<\s*code[^>]*>(.*?)<\s*\/code\s*>/is'     => '`$1`',
			'/<\s*pre[^>]*>(.*?)<\s*\/pre\s*>/is'       => "\n```\n$1\n```\n",
			'/<\s*br\s*\/?>/i'                          => "\n",
			'/<\s*li[^>]*>(.*?)<\s*\/li\s*>/is'         => "- $1\n",
			'/<\s*ul[^>]*>|<\s*\/ul\s*>/i'              => "\n",
		];
		foreach ( $replacements as $pattern => $replacement ) {
			$html = $this->replace_or_keep( $pattern, $replacement, $html );
		}

		$text = strip_tags( $html );
		$text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5, 'UTF-8' );

		return $this->normalize_whitespace( $text );
	}

	/**
	 * Collapse every run of whitespace into a single space and trim the result.
	 *
	 * @param  string  $text
	 *
	 * @return string
	 */
	private function normalize_whitespace( string $text ): string {
		$normalized = preg_replace( '/\s+/u', ' ', $text );

		if ( null === $normalized ) {
			// The `u` modifier fails on invalid UTF-8; retry byte-wise rather than
			// silently dropping the whole text.
			$normalized = preg_replace( '/\s+/', ' ', $text );
		}

		return trim( is_string( $normalized ) ? $normalized : $text );
	}

	/**
	 * Segment plain text using blank-line paragraph breaks.
	 *
	 * Two or more consecutive line breaks separate paragraphs; LF, CRLF and CR
	 * line endings are all recognised.
	 *
	 * @param  string  $text
	 * @param  Dataset  $dataset
	 *
	 * @return Segment[]
	 */
	private function segment_text( string $text, Dataset $dataset ): array {
		$segments = [];
		$order    = 0;

		$paragraphs = preg_split( '/(?:\r\n|\r|\n){2,}/', trim( $text ) );
		if ( false === $paragraphs ) {
			$paragraphs = [ $text ];
		}

		foreach ( $paragraphs as $paragraph ) {
			$paragraph = $this->normalize_whitespace( $paragraph );
			if ( strlen( $paragraph ) < self::MIN_SEGMENT_LENGTH ) {
				continue; // Skip empty or very short paragraphs.
			}

			$segments[] = $this->build_segment( $dataset, null, $paragraph, $order++ );
		}

		return $segments;
	}

	/**
	 * Group segments into clusters that stay within a token budget.
	 *
	 * A new cluster is started whenever adding the next segment would exceed the
	 * budget. A single segment larger than the budget still gets a cluster of
	 * its own.
	 *
	 * @param  Segment[]  $segments  Array of Segment objects.
	 * @param  int  $max_tokens_per_cluster  Max tokens per cluster.
	 *
	 * @return array Array of clusters, each containing Segment objects.
	 */
	private function group_into_clusters( array $segments, int $max_tokens_per_cluster ): array {
		$clusters            = [];
		$current_cluster     = [];
		$current_token_count = 0;

		foreach ( $segments as $segment ) {
			$segment_tokens = $this->estimate_tokens( $segment->content )
				+ $this->estimate_tokens( $segment->heading ?? '' );

			$would_overflow = $current_token_count + $segment_tokens > $max_tokens_per_cluster;
			if ( $would_overflow && [] !== $current_cluster ) {
				$clusters[]          = $current_cluster;
				$current_cluster     = [];
				$current_token_count = 0;
			}

			$current_cluster[]    = $segment;
			$current_token_count += $segment_tokens;

			// NOTE(review): nothing in this class prefixes headings with "h2:", so this
			// only fires when the heading text itself starts with "h2:". When it does,
			// the cluster is closed right after the current segment. Confirm intent.
			if ( $segment->heading && preg_match( '/^h2:/i', $segment->heading ) ) {
				$clusters[]          = $current_cluster;
				$current_cluster     = [];
				$current_token_count = 0;
			}
		}

		if ( [] !== $current_cluster ) {
			$clusters[] = $current_cluster;
		}

		return $clusters;
	}

	/**
	 * Estimate the token count of a text.
	 *
	 * Takes the larger of two heuristics: words * 1.33 and bytes / 4.
	 *
	 * @param  string  $text
	 *
	 * @return int
	 */
	private function estimate_tokens( string $text ): int {
		if ( '' === $text ) {
			return 0;
		}

		$by_words = (int) ( str_word_count( $text ) * 1.33 );
		$by_chars = (int) ( strlen( $text ) / 4 );

		return max( $by_words, $by_chars );
	}

	/**
	 * Clear the content cache to free memory.
	 *
	 * @return void
	 */
	public function clear_cache(): void {
		$this->content_cache = [];
	}

	/**
	 * Build a Segment carrying the dataset's source metadata.
	 *
	 * @param  Dataset  $dataset  Dataset the content belongs to.
	 * @param  string|null  $heading  Heading context, or null when there is none.
	 * @param  string  $content  Normalized segment text.
	 * @param  int  $order  Zero-based position of the segment within the content.
	 *
	 * @return Segment
	 */
	private function build_segment( Dataset $dataset, ?string $heading, string $content, int $order ): Segment {
		return new Segment( [
			'source_type'     => $dataset->get_source_type(),
			'source_sub_type' => $dataset->get_source_sub_type(),
			'source_id'       => $dataset->get_source(),
			'title'           => $dataset->get_name(),
			'heading'         => $heading,
			'content'         => $content,
			'order'           => $order,
		] );
	}

	/**
	 * Run preg_replace() and fall back to the unmodified subject on failure.
	 *
	 * preg_replace() returns null when PCRE errors out (for example when the
	 * backtrack limit is hit on a large document); keeping the input is safer
	 * than propagating null into string-typed code.
	 *
	 * @param  string  $pattern
	 * @param  string  $replacement
	 * @param  string  $subject
	 *
	 * @return string
	 */
	private function replace_or_keep( string $pattern, string $replacement, string $subject ): string {
		$result = preg_replace( $pattern, $replacement, $subject );

		return is_string( $result ) ? $result : $subject;
	}
}