<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Generators;

use Limb_Chatbot\Includes\Data_Objects\Dataset;
use Limb_Chatbot\Includes\Data_Objects\Dataset_Entry;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Interfaces\Knowledge_Generator_Interface;
use Limb_Chatbot\Includes\Services\Collection;
use Limb_Chatbot\Includes\Services\Helper;
use Limb_Chatbot\Includes\Services\Knowledge\Chunkers\Heading_Aware_Chunker;
use Limb_Chatbot\Includes\Services\Knowledge\Content_Extractors\URL_Content_Extractor;
use Limb_Chatbot\Includes\Services\Knowledge\Builders\Chunk_Entry_Builder;
use Limb_Chatbot\Includes\Services\Knowledge\Knowledge_Mappers\URL_Knowledge_Mapper;

/**
 * URL Knowledge Generator using chunk-based approach.
 *
 * Generates knowledge entries from URL sources using:
 * 1. Content extraction from the URL (delegated to URL_Content_Extractor)
 * 2. Cleaning of the extracted HTML for the `source_content` preview meta
 * 3. Heading-aware chunking
 * 4. Chunk-based dataset entries
 *
 * @since 1.0.0
 */
class URL_Knowledge_Generator implements Knowledge_Generator_Interface {

	/**
	 * Alternation of block-level / table tag names around which inter-tag
	 * whitespace is collapsed while cleaning the source content.
	 *
	 * @var string
	 */
	private const COLLAPSIBLE_TAGS = 'h[1-6]|p|ul|ol|li|table|thead|tbody|tr|td|th';

	/**
	 * URL content extractor for fetching and extracting content.
	 *
	 * @var URL_Content_Extractor
	 */
	protected URL_Content_Extractor $content_extractor;

	/**
	 * Heading-aware chunker for segmenting content.
	 *
	 * @var Heading_Aware_Chunker
	 */
	protected Heading_Aware_Chunker $chunker;

	/**
	 * Chunk entry builder for creating dataset entries.
	 *
	 * @var Chunk_Entry_Builder
	 */
	protected Chunk_Entry_Builder $entry_builder;

	/**
	 * Constructor to initialize dependencies.
	 *
	 * @since 1.0.0
	 */
	public function __construct() {
		$this->content_extractor = new URL_Content_Extractor();
		$this->chunker           = new Heading_Aware_Chunker();
		$this->entry_builder     = new Chunk_Entry_Builder();
	}

	/**
	 * Generate knowledge entries from a dataset source.
	 *
	 * Reads the URL from the dataset source (falling back to the `url` meta
	 * value), validates it, extracts the page content, stores a cleaned copy in
	 * the `source_content` meta, renames the dataset after the page title and
	 * finally turns the chunked content into dataset entries.
	 *
	 * @param  Dataset  $dataset  Dataset to generate entries for.
	 *
	 * @return Dataset Updated dataset with generated entries.
	 * @throws Exception If the URL is missing/invalid, too little content is
	 *                   extracted (flagged as a warning), chunking yields nothing,
	 *                   or no entry could be built.
	 * @since 1.0.0
	 */
	public function generate( Dataset $dataset ): Dataset {
		// Each dataset represents a single URL; the source field holds it, with the
		// `url` meta value used as a fallback.
		$url    = $dataset->get_source() ?: $dataset->get_meta_value( 'url' );
		$mapper = new URL_Knowledge_Mapper();

		if ( empty( $url ) || ! is_string( $url ) ) {
			throw new Exception(
				Error_Codes::VALIDATION_INVALID_VALUE,
				__( 'Source must be a valid URL string', 'limb-chatbot' )
			);
		}

		// Validate URL format
		if ( ! filter_var( $url, FILTER_VALIDATE_URL ) ) {
			throw new Exception(
				Error_Codes::VALIDATION_INVALID_VALUE,
				__( 'Invalid URL format', 'limb-chatbot' )
			);
		}

		// Step 1: Extract content from URL. The result is read as an array with a
		// `content` key and an optional `title` key.
		$extracted = $this->content_extractor->extract( $url );

		// Pages with fewer than 100 bytes of visible text are treated as empty.
		if ( empty( $extracted['content'] ) || strlen( trim( strip_tags( $extracted['content'] ) ) ) < 100 ) {
			$exception = new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No content extracted from URL', 'limb-chatbot' ),
				[ 'warning' => true ]
			);
			$exception->attach_link( $url, $mapper->get_entry_input( $url ) );
			throw $exception;
		}

		// Title fallback chain: extracted title -> URL host -> the URL itself.
		// wp_parse_url() may return null/false when the URL has no host component,
		// which must not reach the string-typed $title parameters further down.
		$title = ! empty( $extracted['title'] ) ? $extracted['title'] : wp_parse_url( $url, PHP_URL_HOST );
		if ( empty( $title ) ) {
			$title = $url;
		}

		// Clean source_content more strictly for preview
		$cleaned_content = $this->clean_html_for_source_content( $extracted['content'] );
		$dataset->update_meta( 'source_content', $cleaned_content );
		$dataset->set_name( $title );
		$dataset->save();

		// Step 2: Chunk content with heading awareness (uses the extracted content,
		// not the stricter preview copy).
		$normalized_content = $extracted['content'];
		$chunks             = $this->chunker->chunk( $normalized_content, $title );

		if ( empty( $chunks ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'Failed to chunk content into segments', 'limb-chatbot' )
			);
		}

		// Step 3: Process chunks into dataset entries
		$this->process_chunks( $dataset, $chunks, $normalized_content, $title, $url );

		return $dataset;
	}

	/**
	 * Clean HTML content for source_content meta.
	 *
	 * Runs the markup through wp_kses() with a small allow-list (only `href` on
	 * links and `src`/`alt` on images survive as attributes), drops empty inline
	 * wrappers, and normalizes whitespace so that block-level tags are separated
	 * by exactly one space while spacing between inline elements is preserved.
	 *
	 * Every regex step is failure-tolerant: if PCRE reports an error (for example
	 * invalid UTF-8 in scraped content), the content is kept instead of being
	 * silently replaced by an empty string.
	 *
	 * @param  string  $html  Raw HTML content.
	 *
	 * @return string Clean HTML with only allowed tags and attributes.
	 * @since 1.0.0
	 */
	protected function clean_html_for_source_content( string $html ): string {
		// Define allowed tags for WordPress WYSIWYG editor
		$allowed_tags = [
			'a'      => [ 'href' => [] ],
			'p'      => [],
			'h1'     => [],
			'h2'     => [],
			'h3'     => [],
			'h4'     => [],
			'h5'     => [],
			'h6'     => [],
			'span'   => [],
			'strong' => [],
			'b'      => [],
			'em'     => [],
			'i'      => [],
			'img'    => [ 'src' => [], 'alt' => [] ],
			'table'  => [],
			'thead'  => [],
			'tbody'  => [],
			'tr'     => [],
			'td'     => [],
			'th'     => [],
			'ul'     => [],
			'ol'     => [],
			'li'     => [],
			'br'     => [],
		];

		// Use wp_kses to strip all attributes except those specified
		$cleaned = wp_kses( $html, $allowed_tags );

		// Clean up any remaining empty tags
		$cleaned = $this->replace_or_keep( '/<(p|span|strong|b|em|i|a)[^>]*>\s*<\/\1>/is', '', $cleaned );

		// Normalize whitespace. The /u pattern fails (returns null) on invalid UTF-8,
		// so fall back to an ASCII-only whitespace class that cannot split or match
		// bytes of a multi-byte sequence.
		$collapsed = preg_replace( '/\s+/u', ' ', $cleaned );
		if ( ! is_string( $collapsed ) ) {
			$collapsed = $this->replace_or_keep( '/[ \t\r\n\f\x0B]+/', ' ', $cleaned );
		}
		$cleaned = $collapsed;

		// Drop whitespace between tags only when it touches a block/table tag.
		// Whitespace between two inline elements is significant text
		// ("<b>foo</b> <i>bar</i>") and must be kept.
		$cleaned = $this->replace_or_keep(
			'/(<\/?(?:' . self::COLLAPSIBLE_TAGS . ')\b[^>]*>)\s+(?=<)/i',
			'$1',
			$cleaned
		);
		$cleaned = $this->replace_or_keep(
			'/>\s+(<\/?(?:' . self::COLLAPSIBLE_TAGS . ')\b)/i',
			'>$1',
			$cleaned
		);

		// Restore spacing around block elements for readability
		$cleaned = $this->replace_or_keep(
			'/><(h[1-6]|p|ul|ol|li|table|thead|tbody|tr)/i',
			'> <$1',
			$cleaned
		);
		$cleaned = $this->replace_or_keep(
			'/(<\/h[1-6]|<\/p|<\/ul|<\/ol|<\/li|<\/table|<\/thead|<\/tbody|<\/tr)></i',
			'$1> <',
			$cleaned
		);

		return trim( $cleaned );
	}

	/**
	 * Run preg_replace() and keep the original subject when PCRE fails.
	 *
	 * preg_replace() returns null on errors such as invalid UTF-8 with the /u
	 * modifier or an exhausted backtrack limit; propagating that null would wipe
	 * the content being cleaned.
	 *
	 * @param  string  $pattern  Regular expression.
	 * @param  string  $replacement  Replacement string.
	 * @param  string  $subject  Text to process.
	 *
	 * @return string Replaced text, or $subject unchanged on a PCRE error.
	 * @since 1.0.0
	 */
	private function replace_or_keep( string $pattern, string $replacement, string $subject ): string {
		$result = preg_replace( $pattern, $replacement, $subject );

		return is_string( $result ) ? $result : $subject;
	}

	/**
	 * Remove div tags while preserving their inner content.
	 *
	 * Repeatedly unwraps the innermost `<div>…</div>` pairs (up to 50 passes),
	 * then handles a leftover unclosed `<div>` and finally strips any stray
	 * opening/closing div tags. Unwrapped content that does not already start
	 * with a block-level tag is wrapped in `<p>`.
	 *
	 * Note: this helper is not called from anywhere else in this class.
	 *
	 * @param  string  $html  HTML content with div tags.
	 *
	 * @return string HTML with div tags removed but content preserved.
	 * @since 1.0.0
	 */
	private function remove_div_tags_preserve_content( string $html ): string {
		$max_iterations = 50;
		$iteration      = 0;

		while ( preg_match( '/<div[^>]*>/i', $html ) && $iteration < $max_iterations ) {
			// Match only divs that contain no other div tag, i.e. the innermost ones.
			$unwrapped = preg_replace_callback(
				'/<div[^>]*>((?:(?!<div[^>]*>|<\/div>).)*?)<\/div>/is',
				function ( $matches ) {
					return $this->wrap_div_content( $matches[1] );
				},
				$html
			);

			// null signals a PCRE failure (e.g. backtrack limit on large input);
			// identical output means nothing is left to unwrap. Stop in both cases.
			if ( ! is_string( $unwrapped ) || $unwrapped === $html ) {
				break;
			}

			$html = $unwrapped;
			$iteration ++;
		}

		// Unclosed div: treat everything after the opening tag as its content.
		$tail_fixed = preg_replace_callback(
			'/<div[^>]*>(.*)/is',
			function ( $matches ) {
				return $this->wrap_div_content( $this->replace_or_keep( '/<\/div>/i', '', $matches[1] ) );
			},
			$html
		);
		if ( is_string( $tail_fixed ) ) {
			$html = $tail_fixed;
		}

		// Strip whatever div tags are still left.
		$html = $this->replace_or_keep( '/<div[^>]*>/i', '', $html );
		$html = $this->replace_or_keep( '/<\/div>/i', '', $html );

		return $html;
	}

	/**
	 * Turn the inner content of a removed div into standalone markup.
	 *
	 * @param  string  $inner_content  Raw content found inside a div.
	 *
	 * @return string Empty string for blank content, the trimmed content itself
	 *                when it starts with a block-level tag, otherwise the trimmed
	 *                content wrapped in `<p>`.
	 * @since 1.0.0
	 */
	private function wrap_div_content( string $inner_content ): string {
		$inner_content = trim( $inner_content );

		// Strict comparison: empty() would also discard the literal text "0".
		if ( '' === $inner_content ) {
			return '';
		}

		if ( preg_match( '/^<(h[1-6]|p|ul|ol|table|thead|tbody|tr|li)/i', $inner_content ) ) {
			return $inner_content;
		}

		return '<p>' . $inner_content . '</p>';
	}

	/**
	 * Process chunks into dataset entries for URL content.
	 *
	 * Enriches every chunk with the title, source URL, source type and a running
	 * index, builds a Dataset_Entry from it and saves the entries. Failures of
	 * individual chunks (build or save) are logged and skipped.
	 *
	 * @param  Dataset  $dataset  Dataset to store entries.
	 * @param  array  $chunks  Array of chunk data.
	 * @param  string  $normalized_content  Full normalized HTML content (currently unused here).
	 * @param  string  $title  Document title.
	 * @param  string  $url  Source URL.
	 *
	 * @throws Exception If no entry could be built from the chunks.
	 * @since 1.0.0
	 */
	protected function process_chunks(
		Dataset $dataset,
		array $chunks,
		string $normalized_content,
		string $title,
		string $url
	): void {
		// Captured once: inside the closure below __METHOD__ would resolve to
		// "{closure}" and the log entries would lose their origin.
		$log_context = __METHOD__;
		$all_entries = new Collection();
		$source_type = $dataset->get_source_type();

		// Build chunk-based entries with comprehensive metadata
		$chunk_index = 0;
		foreach ( $chunks as $chunk ) {
			try {
				// Enrich chunk with source metadata for better context. The index
				// advances even when building this chunk fails.
				$chunk['post_title']  = $title;
				$chunk['source_url']  = $url;
				$chunk['source_type'] = $source_type;
				$chunk['chunk_index'] = $chunk_index ++;

				$entry = $this->entry_builder->build( $chunk );
				$entry->set_dataset_id( $dataset->get_id() );
				$all_entries->push_item( $entry );
			} catch ( \Exception $e ) {
				// Log individual entry build errors but don't fail the whole process
				Helper::log( [
					'error'      => 'Failed to build chunk entry',
					'dataset_id' => $dataset->get_id(),
					'chunk_data' => $chunk,
					'exception'  => $e->getMessage()
				], $log_context );
			}
		}

		if ( $all_entries->is_empty() ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No valid entries were generated from given source', 'limb-chatbot' )
			);
		}

		// Save all entries to the dataset
		$all_entries->each( function ( Dataset_Entry $entry ) use ( $dataset, $log_context ) {
			try {
				$entry->save();
			} catch ( \Exception $e ) {
				// Log individual entry save errors but don't fail the whole process
				Helper::log( [
					'error'      => 'Failed to save dataset entry',
					'dataset_id' => $dataset->get_id(),
					'entry_data' => $entry->get_entry(),
					'exception'  => $e->getMessage()
				], $log_context );
			}
		} );
	}
}

