<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Generators;

use Limb_Chatbot\Includes\Data_Objects\Dataset;
use Limb_Chatbot\Includes\Data_Objects\Dataset_Entry;
use Limb_Chatbot\Includes\Data_Objects\File;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Interfaces\Knowledge_Generator_Interface;
use Limb_Chatbot\Includes\Services\Collection;
use Limb_Chatbot\Includes\Services\Helper;
use Limb_Chatbot\Includes\Services\Knowledge\Content_Extractors\PDF_Content_Extractor;

/**
 * File Knowledge Generator
 *
 * Generates knowledge base entries from uploaded files. Only PDF files are
 * currently supported: they are processed with the same extract-and-chunk
 * approach that is used for posts. Any other file type is rejected with a
 * validation exception; no AI-based Q&A generation is performed by this class.
 *
 * PDF Process Flow:
 * 1. Retrieves the file by UUID from the dataset source
 * 2. Extracts text from the PDF through PDF_Content_Extractor
 * 3. Normalizes the extracted content to an HTML-like format
 * 4. Stores a cleaned copy of the content in the dataset's 'source_content' meta
 * 5. Chunks the content with heading awareness
 * 6. Creates a Dataset_Entry for each chunk and saves it to the database
 *
 * @package Limb_Chatbot\Includes\Services\Knowledge\Generators
 * @since 1.0.0
 */
class File_Knowledge_Generator extends Post_Knowledge_Generator implements Knowledge_Generator_Interface {

	/**
	 * PDF content extractor for extracting text from PDF files.
	 *
	 * @var PDF_Content_Extractor
	 */
	protected PDF_Content_Extractor $pdf_content_extractor;

	/**
	 * Constructor to initialize dependencies.
	 *
	 * Runs the parent constructor first, then creates the PDF content extractor
	 * used by this generator.
	 *
	 * @since 1.0.0
	 */
	public function __construct() {
		parent::__construct();
		$this->pdf_content_extractor = new PDF_Content_Extractor();
	}

	/**
	 * Generates knowledge entries from an uploaded file.
	 *
	 * The dataset source is treated as a file UUID. PDF files are processed with
	 * the chunking approach (same as posts). Every other MIME type is rejected:
	 * there is no fallback generation path for non-PDF files.
	 *
	 * @param  Dataset  $dataset  The dataset containing the file UUID as source.
	 *
	 * @return Dataset The updated dataset with generated entries.
	 * @throws Exception If the file is not found, the file is not a PDF, or generation fails.
	 * @since 1.0.0
	 */
	public function generate( Dataset $dataset ): Dataset {
		$file_uuid = $dataset->get_source();
		$file      = File::find_by_uuid( $file_uuid );

		if ( ! $file instanceof File ) {
			throw new Exception( Error_Codes::FILE_NOT_FOUND,
				sprintf( __( 'File with UUID %s not found', 'limb-chatbot' ), $file_uuid ) );
		}

		// Use chunking approach for PDFs
		if ( $file->get_mime_type() === 'application/pdf' ) {
			return $this->generate_from_pdf( $dataset, $file );
		}

		// Any other file type is unsupported and rejected explicitly.
		throw new Exception( Error_Codes::VALIDATION_INVALID_VALUE,
			__( 'Only PDF type is supported', 'limb-chatbot' ) );
	}

	/**
	 * Generate knowledge entries from PDF using chunking approach.
	 *
	 * Steps:
	 * 1. Extract the PDF content
	 * 2. Normalize it through the content extractor
	 * 3. Store a cleaned copy in the dataset's 'source_content' meta
	 * 4. Chunk the normalized content
	 * 5. Turn the chunks into dataset entries
	 *
	 * The title passed to the chunker and to the entries is the dataset name,
	 * or the file's original name when the dataset name is empty.
	 *
	 * @param  Dataset  $dataset  Dataset to generate entries for.
	 * @param  File  $file  PDF file object.
	 *
	 * @return Dataset Updated dataset with generated entries.
	 * @throws Exception If extraction, normalization or chunking yields nothing, or entry processing fails.
	 * @since 1.0.0
	 */
	protected function generate_from_pdf( Dataset $dataset, File $file ): Dataset {
		// Step 1: Extract PDF content
		$pdf_content = $this->pdf_content_extractor->extract( $file );
		if ( empty( $pdf_content ) ) {
			// Flagged as a warning; the message ends with "in " and a link to the file is attached
			// (presumably rendered right after the message - confirm against Exception::attach_link).
			$exception = new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No content found in ', 'limb-chatbot' ),
				[ 'warning' => true ]
			);
			$exception->attach_link( $file->get_url(), $file->get_original_name() );
			throw $exception;
		}

		// Step 2: Normalize through the content extractor (same as posts)
		$normalized_content = $this->content_extractor->normalize( $pdf_content );

		if ( empty( $normalized_content ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No content found after normalization', 'limb-chatbot' )
			);
		}

		// Step 3: Store a cleaned copy of the content in the dataset meta so it can be edited later
		$cleaned_content = $this->clean_html_for_source_content( $normalized_content );
		$dataset->update_meta( 'source_content', $cleaned_content );

		// Step 4: Chunk content (same as posts)
		$title  = $dataset->get_name() ?: $file->get_original_name();
		$chunks = $this->chunker->chunk( $normalized_content, $title );

		if ( empty( $chunks ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'Failed to chunk PDF content into segments', 'limb-chatbot' )
			);
		}

		// Step 5: Process chunks into dataset entries (same as posts)
		$this->process_chunks_for_file( $dataset, $chunks, $title );

		return $dataset;
	}

	/**
	 * Process chunks into dataset entries for files.
	 *
	 * Each chunk is enriched with the title (when not empty), the dataset's source
	 * URL and source type, and a running chunk index, then handed to the entry
	 * builder. Built entries are assigned to the dataset and saved one by one.
	 *
	 * Failures of individual chunks (build or save) are logged and skipped so that
	 * one bad chunk does not abort the whole run. The method throws only when the
	 * overall result is empty: no entry could be built, or no entry could be saved.
	 *
	 * @param  Dataset  $dataset  Dataset to store entries.
	 * @param  array  $chunks  Array of chunk data.
	 * @param  string  $title  Document title.
	 *
	 * @throws Exception If no entry could be built, or none of the built entries could be saved.
	 * @since 1.0.0
	 */
	protected function process_chunks_for_file(
		Dataset $dataset,
		array $chunks,
		string $title = ''
	): void {
		// Inside a closure __METHOD__ names the closure, not this method, so capture it
		// here and reuse it for every log call below.
		$log_context = __METHOD__;

		$all_entries = new Collection();
		$source_url  = $dataset->source_url() ?? '';
		$source_type = $dataset->get_source_type();

		// Build chunk-based entries with source metadata
		$chunk_index = 0;
		foreach ( $chunks as $chunk ) {
			try {
				if ( ! empty( $title ) ) {
					$chunk['post_title'] = $title;
				}
				$chunk['source_url']  = $source_url;
				$chunk['source_type'] = $source_type;
				// The index advances for every chunk, including ones that later fail to build.
				$chunk['chunk_index'] = $chunk_index++;

				$entry = $this->entry_builder->build( $chunk );
				$entry->set_dataset_id( $dataset->get_id() );
				$all_entries->push_item( $entry );
			} catch ( \Exception $e ) {
				// Log individual entry build errors but don't fail the whole process
				Helper::log( [
					'error'      => 'Failed to build chunk entry',
					'dataset_id' => $dataset->get_id(),
					'chunk_data' => $chunk,
					'exception'  => $e->getMessage()
				], $log_context );
			}
		}

		if ( $all_entries->is_empty() ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No valid entries were generated from given source', 'limb-chatbot' )
			);
		}

		// Save all entries to the dataset, counting how many were actually saved
		$saved_count = 0;
		$all_entries->each( function ( Dataset_Entry $entry ) use ( $dataset, $log_context, &$saved_count ) {
			try {
				$entry->save();
				$saved_count++;
			} catch ( \Exception $e ) {
				// Log individual entry save errors but don't fail the whole process
				Helper::log( [
					'error'      => 'Failed to save dataset entry',
					'dataset_id' => $dataset->get_id(),
					'entry_data' => $entry->get_entry(),
					'exception'  => $e->getMessage()
				], $log_context );
			}
		} );

		// Partial failures are tolerated, but a run that stored nothing must not look successful.
		if ( 0 === $saved_count ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'Failed to save any of the generated entries', 'limb-chatbot' )
			);
		}
	}
}