<?php

namespace Limb_Chatbot\Includes\Services\Jobs\Handlers;

use Limb_Chatbot\Includes\Data_Objects\AI_Model;
use Limb_Chatbot\Includes\Data_Objects\Chatbot;
use Limb_Chatbot\Includes\Data_Objects\Config;
use Limb_Chatbot\Includes\Data_Objects\Dataset;
use Limb_Chatbot\Includes\Data_Objects\Dataset_Entry;
use Limb_Chatbot\Includes\Data_Objects\Job;
use Limb_Chatbot\Includes\Data_Objects\Task;
use Limb_Chatbot\Includes\Data_Objects\Vector_Index;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Interfaces\Multitask_Handler_Interface;
use Limb_Chatbot\Includes\Services\Dataset_Service;
use Limb_Chatbot\Includes\Services\Job\Abstract_Job_Handler;
use Limb_Chatbot\Includes\Services\Knowledge\Indexing_Service;
use Limb_Chatbot\Includes\Services\Knowledge\Knowledge_Generator;
use Limb_Chatbot\Includes\Vector_Dbs\Local\Local;
use Limb_Chatbot\Includes\Vector_Dbs\Pinecone\Pinecone;

/**
 * Dataset Sync Job Handler
 *
 * Handles dataset synchronization jobs. Migrated from WP_Background_Process system.
 * Syncs datasets by clearing existing entries, regenerating knowledge, and indexing entries.
 * Implements multitask pattern: parent tasks generate dataset entries, child tasks index them.
 *
 * @since 1.1.0
 */
class Dataset_Sync extends Abstract_Job_Handler implements Multitask_Handler_Interface {

	/**
	 * Dataset service instance, used to clear a dataset before it is regenerated.
	 *
	 * @var Dataset_Service
	 * @since 1.1.0
	 */
	private Dataset_Service $dataset_service;

	/**
	 * Batch size for generating child tasks.
	 *
	 * @var int
	 * @since 1.1.0
	 */
	public int $child_task_batch_size = 100;

	/**
	 * Constructor.
	 *
	 * Runs the parent constructor and creates the dataset service.
	 *
	 * @since 1.1.0
	 */
	public function __construct() {
		parent::__construct();
		$this->dataset_service = new Dataset_Service();
	}

	/**
	 * Get the job type this handler manages.
	 *
	 * @return string Always Job::TYPE_DATASET_SYNC.
	 * @since 1.1.0
	 */
	public function get_job_type(): string {
		return Job::TYPE_DATASET_SYNC;
	}

	/**
	 * Validate job configuration.
	 *
	 * `$config['dataset_ids']` must be either a non-empty array of dataset IDs or the
	 * exact string 'all'. For 'all' only the chatbot is checked here. For an explicit
	 * list, at least one dataset must exist and the indexing metadata of the FIRST
	 * matching dataset (dimension, AI model, config, vector index) must be valid.
	 *
	 * @param  array  $config  Job configuration.
	 * @param  string|null  $chatbot_uuid  Chatbot UUID (optional).
	 *
	 * @return bool True if valid.
	 * @throws Exception If validation fails.
	 * @since 1.1.0
	 */
	public function validate( array $config, ?string $chatbot_uuid = null ): bool {
		$dataset_ids = $config['dataset_ids'] ?? [];
		$sync_all    = $this->is_sync_all( $dataset_ids );

		// Anything that is neither a non-empty array nor the 'all' sentinel is rejected.
		if ( empty( $dataset_ids ) || ( ! is_array( $dataset_ids ) && ! $sync_all ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'Dataset IDs are required for dataset sync.', 'limb-chatbot' )
			);
		}

		// 'all': per-dataset validation is skipped here, only the chatbot is checked.
		if ( $sync_all ) {
			$this->assert_chatbot_exists( $chatbot_uuid );

			return true;
		}

		// At least one of the requested datasets must exist.
		$datasets = Dataset::where( [ 'id' => $dataset_ids ] );
		if ( $datasets->is_empty() ) {
			throw new Exception(
				Error_Codes::NOT_FOUND,
				__( 'Datasets not found', 'limb-chatbot' )
			);
		}

		// Indexing configuration is read from the first dataset only.
		$dataset = $datasets->first();

		$dimension            = $this->read_meta_value( $dataset, 'dimension' );
		$indexing_ai_model_id = $this->read_meta_value( $dataset, 'index_ai_model_id' );
		$indexing_config_id   = $this->read_meta_value( $dataset, 'index_config_id' );
		$vector_index_id      = $this->read_meta_value( $dataset, 'vector_index_id' );

		if ( empty( $dimension ) ) {
			throw new Exception(
				Error_Codes::VALIDATION_INVALID_VALUE,
				__( 'Vector dimension is required.', 'limb-chatbot' )
			);
		}

		if ( empty( $indexing_ai_model_id ) || ! AI_Model::find( $indexing_ai_model_id ) ) {
			throw new Exception(
				Error_Codes::AI_MODEL_NOT_SET,
				__( 'Invalid indexing AI model', 'limb-chatbot' )
			);
		}

		if ( empty( $indexing_config_id ) || ! Config::find( $indexing_config_id ) ) {
			throw new Exception(
				Error_Codes::NOT_FOUND,
				__( 'Invalid indexing config', 'limb-chatbot' )
			);
		}

		// This check already guarantees the vector index exists, so no second lookup
		// is performed after the chatbot validation below.
		if ( empty( $vector_index_id ) || ! Vector_Index::find( $vector_index_id ) ) {
			throw new Exception(
				Error_Codes::NOT_FOUND,
				__( 'Invalid vector index', 'limb-chatbot' )
			);
		}

		$this->assert_chatbot_exists( $chatbot_uuid );

		return true;
	}

	/**
	 * Get total number of datasets to sync.
	 *
	 * For 'all' the total is a count of datasets of the type derived from the job
	 * sub-type; otherwise it is the number of IDs in `$config['dataset_ids']`.
	 *
	 * @param  array  $config  Job configuration.
	 * @param  Job  $job  Job instance.
	 *
	 * @return int Total task count.
	 * @throws Exception If no dataset IDs are configured or the total is zero.
	 * @since 1.1.0
	 */
	public function get_total( array $config, Job $job ): int {
		$dataset_ids = $config['dataset_ids'] ?? [];

		if ( $this->is_sync_all( $dataset_ids ) ) {
			$total = Dataset::count( [ 'type' => $this->resolve_dataset_type( $job ) ] );
		} else {
			if ( empty( $dataset_ids ) || ! is_array( $dataset_ids ) ) {
				throw new Exception(
					Error_Codes::EMPTY_VALUE,
					__( 'There is no dataset to sync', 'limb-chatbot' )
				);
			}

			$total = count( $dataset_ids );
		}

		if ( empty( $total ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'Total is empty', 'limb-chatbot' )
			);
		}

		return (int) $total;
	}

	/**
	 * Generate a batch of tasks for dataset sync.
	 *
	 * Takes the `$offset`/`$limit` slice of the dataset ID list, sets the matching
	 * datasets to PENDING status, and creates one task per dataset ID in the slice.
	 * Supports 'all' to sync all datasets of the type derived from the job sub-type.
	 *
	 * @param  Job  $job  Job instance.
	 * @param  array  $config  Job configuration.
	 * @param  int  $offset  Starting offset for this batch.
	 * @param  int  $limit  Maximum number of tasks to generate.
	 *
	 * @return int Number of tasks actually created.
	 * @since 1.1.0
	 */
	public function generate_task_batch( Job $job, array $config, int $offset, int $limit ): int {
		$dataset_ids = $config['dataset_ids'] ?? [];

		if ( $this->is_sync_all( $dataset_ids ) ) {
			// NOTE(review): no explicit ordering is passed here and the list is re-fetched for
			// every batch, so offset-based slicing relies on a stable default order - confirm.
			$dataset_ids = Dataset::where( [ 'type' => $this->resolve_dataset_type( $job ) ], -1, -1 )->pluck( 'id' );
		}

		if ( empty( $dataset_ids ) || ! is_array( $dataset_ids ) ) {
			return 0;
		}

		// Get the slice of dataset IDs for this batch
		$batch_ids = array_slice( $dataset_ids, $offset, $limit );

		// Offset past the end of the list: nothing to do. Returning here also ensures an
		// empty ID list is never handed to Dataset::where().
		if ( empty( $batch_ids ) ) {
			return 0;
		}

		// Set datasets to PENDING status before processing
		foreach ( Dataset::where( [ 'id' => $batch_ids ] ) as $dataset ) {
			$dataset->set_status( Dataset::STATUS_PENDING );
			$dataset->save();
		}

		$task_count = 0;

		foreach ( $batch_ids as $dataset_id ) {
			// Only tasks that were actually created are counted.
			if ( $this->create_task( $job->get_id(), [ 'dataset_id' => (int) $dataset_id ] ) ) {
				$task_count ++;
			}
		}

		return $task_count;
	}

	/**
	 * Process a single task.
	 *
	 * A task with a parent task ID is a child (indexing) task and is handed to
	 * Indexing_Service::index_entry(). A task without a parent does no work here;
	 * its sub-tasks are produced by generate_sub_tasks().
	 *
	 * @param  Task  $task  Task to process.
	 *
	 * @return bool True on success.
	 * @throws Exception If processing fails.
	 * @since 1.1.0
	 */
	public function process_task( Task $task ): bool {
		if ( ! $task->get_parent_task_id() ) {
			// Parent task: nothing to process directly.
			return true;
		}

		// Child task: index the dataset entry referenced by the task.
		return ( new Indexing_Service() )->index_entry( $task );
	}

	/**
	 * Generate sub-tasks for a parent task.
	 *
	 * On the first call for a parent task (no child tasks exist yet) the dataset is
	 * cleared, its 'errors' meta is reset and its knowledge entries are regenerated.
	 * Afterwards one child task is created for every dataset entry that does not
	 * already have one, provided the dataset has a complete indexing configuration.
	 *
	 * @param  Task  $task  Parent task.
	 *
	 * @return bool Always true in this implementation: all sub-tasks are created in one pass.
	 * @throws Exception If generation fails.
	 * @since 1.1.0
	 */
	public function generate_sub_tasks( Task $task ): bool {
		$payload    = $task->get_payload();
		$dataset_id = $payload['dataset_id'] ?? null;
		$job        = $task->job();

		if ( empty( $dataset_id ) ) {
			return true; // Invalid payload, skip
		}

		$dataset = Dataset::find( $dataset_id );

		if ( ! $dataset instanceof Dataset ) {
			return true; // Dataset not found, skip
		}

		// Probe for a single child task: none means this is the first pass for this parent.
		$is_first_batch = $this->job_task_repository->where( [
			'job_id'         => $job->get_id(),
			'parent_task_id' => $task->get_id(),
		], 1, 1, 'id', 'ASC' )->is_empty();

		if ( $is_first_batch ) {
			// Clear existing entries for fresh sync
			$this->dataset_service->clear( $dataset );

			// Clear any previous errors
			$dataset->update_meta( 'errors', wp_json_encode( [] ) );

			// Regenerate knowledge entries; the generator returns the dataset to continue with.
			$dataset = ( new Knowledge_Generator() )->generate( $dataset );

			// Store dataset ID in parent task payload
			$task->set_payload( array_merge( $payload, [ 'dataset_id' => $dataset->get_id() ] ) );
			$task->save();
		}

		// Get indexing configuration from dataset
		$index_config_id   = $dataset->get_meta_value( 'index_config_id' );
		$index_ai_model_id = $dataset->get_meta_value( 'index_ai_model_id' );
		$vector_index_id   = $dataset->get_meta_value( 'vector_index_id' );
		$dimension         = $dataset->get_meta_value( 'dimension' );

		// Only create indexing tasks if indexing configuration exists
		if ( empty( $index_config_id ) || empty( $index_ai_model_id ) || empty( $vector_index_id ) || empty( $dimension ) ) {
			return true; // No indexing config, nothing to do
		}

		// Get all dataset entries (no limit)
		$dataset_entries = Dataset_Entry::where( [ 'dataset_id' => $dataset->get_id() ], -1, -1, 'id', 'ASC' );

		// On the first pass the probe above found no child tasks, so there is nothing to
		// de-duplicate against; the full child-task list is only loaded when resuming.
		$existing_entry_ids = $is_first_batch ? [] : $this->collect_existing_entry_ids( $job->get_id(), $task->get_id() );

		foreach ( $dataset_entries as $entry ) {
			if ( ! $entry instanceof Dataset_Entry ) {
				continue;
			}

			$entry_id = $entry->get_id();

			// Skip if child task already exists (resume scenario)
			if ( isset( $existing_entry_ids[ $entry_id ] ) ) {
				continue;
			}

			// Child payload carries the entry ID plus the indexing configuration.
			$this->create_sub_task( $job->get_id(), $task, [
				'dataset_entry_id' => $entry_id,
				'config_id'        => $index_config_id,
				'ai_model_id'      => $index_ai_model_id,
				'vector_index_id'  => $vector_index_id,
				'dimension'        => $dimension,
			] );
		}

		return true;
	}

	/**
	 * Determine if an exception is critical.
	 *
	 * Delegates to the parent implementation without adding handler-specific rules.
	 *
	 * @param  Exception  $exception  Exception that occurred.
	 *
	 * @return bool True if critical.
	 * @since 1.1.0
	 */
	public function is_critical_error( Exception $exception ): bool {
		return parent::is_critical_error( $exception );
	}

	/**
	 * Tell whether the configured dataset IDs value is the 'all' sentinel.
	 *
	 * Uses a strict comparison so that values such as boolean true, which loosely
	 * equal any non-empty string, are not mistaken for 'all'.
	 *
	 * @param  mixed  $dataset_ids  Raw `dataset_ids` value from the job configuration.
	 *
	 * @return bool True only for the exact string 'all'.
	 * @since 1.1.0
	 */
	private function is_sync_all( $dataset_ids ): bool {
		return 'all' === $dataset_ids;
	}

	/**
	 * Map the job sub-type to the dataset type to sync.
	 *
	 * @param  Job  $job  Job instance.
	 *
	 * @return mixed Dataset::TYPE_INFORMATIONAL_KNOWLEDGE for the 'informational' sub-type,
	 *               Dataset::TYPE_ACTIONABLE_KNOWLEDGE for anything else.
	 * @since 1.1.0
	 */
	private function resolve_dataset_type( Job $job ) {
		return 'informational' === $job->get_sub_type()
			? Dataset::TYPE_INFORMATIONAL_KNOWLEDGE
			: Dataset::TYPE_ACTIONABLE_KNOWLEDGE;
	}

	/**
	 * Ensure the given chatbot exists when a specific chatbot UUID is supplied.
	 *
	 * An empty UUID and Job::CHATBOT_DEFAULT are accepted without a lookup.
	 *
	 * @param  string|null  $chatbot_uuid  Chatbot UUID (optional).
	 *
	 * @return void
	 * @throws Exception If the UUID does not resolve to a chatbot.
	 * @since 1.1.0
	 */
	private function assert_chatbot_exists( ?string $chatbot_uuid ): void {
		if ( empty( $chatbot_uuid ) || $chatbot_uuid === Job::CHATBOT_DEFAULT ) {
			return;
		}

		if ( empty( Chatbot::find_by_uuid( $chatbot_uuid ) ) ) {
			throw new Exception(
				Error_Codes::VALIDATION_INVALID_VALUE,
				__( 'The specified chatbot does not exist.', 'limb-chatbot' )
			);
		}
	}

	/**
	 * Read a single meta value from a dataset.
	 *
	 * @param  Dataset  $dataset  Dataset to read from.
	 * @param  string  $key  Meta key.
	 *
	 * @return mixed The meta value, or null when the dataset has no such meta.
	 * @since 1.1.0
	 */
	private function read_meta_value( Dataset $dataset, string $key ) {
		$meta = $dataset->get_meta( $key );

		return $meta ? $meta->get_meta_value() : null;
	}

	/**
	 * Collect the dataset entry IDs that already have a child task under a parent task.
	 *
	 * @param  mixed  $job_id  ID of the job the tasks belong to.
	 * @param  mixed  $parent_task_id  ID of the parent task.
	 *
	 * @return array Map of dataset entry ID => true, for constant-time lookups.
	 * @since 1.1.0
	 */
	private function collect_existing_entry_ids( $job_id, $parent_task_id ): array {
		$children = $this->job_task_repository->where( [
			'job_id'         => $job_id,
			'parent_task_id' => $parent_task_id,
		], -1, -1, 'id', 'ASC' );

		$entry_ids = [];
		foreach ( $children as $child ) {
			$child_payload = $child->get_payload();
			if ( isset( $child_payload['dataset_entry_id'] ) ) {
				$entry_ids[ $child_payload['dataset_entry_id'] ] = true;
			}
		}

		return $entry_ids;
	}
}
