<?php

namespace Limb_Chatbot\Includes\Services\Jobs\Handlers;

use Limb_Chatbot\Includes\Data_Objects\AI_Model;
use Limb_Chatbot\Includes\Data_Objects\Chatbot;
use Limb_Chatbot\Includes\Data_Objects\Config;
use Limb_Chatbot\Includes\Data_Objects\Dataset;
use Limb_Chatbot\Includes\Data_Objects\Dataset_Entry;
use Limb_Chatbot\Includes\Data_Objects\Job;
use Limb_Chatbot\Includes\Data_Objects\Task;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Interfaces\Multitask_Handler_Interface;
use Limb_Chatbot\Includes\Services\Dataset_Service;
use Limb_Chatbot\Includes\Services\Job\Abstract_Job_Handler;
use Limb_Chatbot\Includes\Services\Knowledge\Indexing_Service;
use Limb_Chatbot\Includes\Services\Knowledge\Knowledge_Generator;

/**
 * Dataset Regenerate Job Handler
 *
 * Handles dataset regeneration jobs. Regenerates knowledge entries with new sanitizing model/config
 * and optionally updates indexing model/config when "all" datasets are selected.
 * Similar to dataset_sync but uses new sanitizing config and optionally new indexing config.
 *
 * @since 1.1.0
 */
class Dataset_Regenerate extends Abstract_Job_Handler implements Multitask_Handler_Interface {

	/**
	 * Dataset service instance, used to clear a dataset's entries before regeneration.
	 *
	 * @var Dataset_Service
	 * @since 1.1.0
	 */
	private Dataset_Service $dataset_service;

	/**
	 * Batch size for generating child tasks.
	 *
	 * NOTE(review): not read anywhere inside this class; presumably consumed by the
	 * multitask processor that drives this handler — confirm before changing.
	 *
	 * @var int
	 * @since 1.1.0
	 */
	public int $child_task_batch_size = 100;

	/**
	 * Constructor.
	 *
	 * Runs the parent constructor and creates the dataset service.
	 *
	 * @since 1.1.0
	 */
	public function __construct() {
		parent::__construct();
		$this->dataset_service = new Dataset_Service();
	}

	/**
	 * Get the job type this handler manages.
	 *
	 * @return string Always Job::TYPE_DATASET_REGENERATE.
	 * @since 1.1.0
	 */
	public function get_job_type(): string {
		return Job::TYPE_DATASET_REGENERATE;
	}

	/**
	 * Validate job configuration.
	 *
	 * Checks, in this order:
	 * - dataset_ids is provided (either the string "all" or an array of IDs)
	 * - ai_model_id and config_id are provided and resolvable (required for data sanitizing)
	 * - When dataset_ids is "all": indexing_ai_model_id and indexing_config_id are provided and resolvable
	 * - When dataset_ids is an array: at least one dataset is found, and every found dataset
	 *   carries index_ai_model_id, index_config_id, vector_index_id and dimension meta
	 * - When a non-default chatbot UUID is given: the chatbot exists
	 *
	 * @param array $config Job configuration.
	 * @param string|null $chatbot_uuid Chatbot UUID (optional).
	 *
	 * @return bool True if valid.
	 * @throws Exception If validation fails.
	 * @since 1.1.0
	 */
	public function validate( array $config, ?string $chatbot_uuid = null ): bool {
		$dataset_ids = $config['dataset_ids'] ?? null;
		$ai_model_id = $config['ai_model_id'] ?? null;
		$config_id   = $config['config_id'] ?? null;

		// dataset_ids must be present.
		if ( empty( $dataset_ids ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'Dataset IDs are required for dataset regeneration.', 'limb-chatbot' )
			);
		}

		// Sanitizing model and config are mandatory and must resolve to existing records.
		if ( empty( $ai_model_id ) || ! AI_Model::find( $ai_model_id ) ) {
			throw new Exception(
				Error_Codes::AI_MODEL_NOT_SET,
				__( 'Valid AI model ID is required for data sanitizing.', 'limb-chatbot' )
			);
		}

		if ( empty( $config_id ) || ! Config::find( $config_id ) ) {
			throw new Exception(
				Error_Codes::NOT_FOUND,
				__( 'Valid config ID is required for data sanitizing.', 'limb-chatbot' )
			);
		}

		if ( $dataset_ids === 'all' ) {
			// Regenerating everything also replaces the indexing model/config,
			// so both must be supplied in the job configuration.
			$indexing_ai_model_id = $config['indexing_ai_model_id'] ?? null;
			$indexing_config_id   = $config['indexing_config_id'] ?? null;

			if ( empty( $indexing_ai_model_id ) || ! AI_Model::find( $indexing_ai_model_id ) ) {
				throw new Exception(
					Error_Codes::AI_MODEL_NOT_SET,
					__( 'Valid indexing AI model ID is required when regenerating all datasets.', 'limb-chatbot' )
				);
			}

			if ( empty( $indexing_config_id ) || ! Config::find( $indexing_config_id ) ) {
				throw new Exception(
					Error_Codes::NOT_FOUND,
					__( 'Valid indexing config ID is required when regenerating all datasets.', 'limb-chatbot' )
				);
			}
		} else {
			// Anything other than "all" must be an explicit list of IDs.
			if ( ! is_array( $dataset_ids ) ) {
				throw new Exception(
					Error_Codes::VALIDATION_INVALID_VALUE,
					__( 'Dataset IDs must be either "all" or an array of dataset IDs.', 'limb-chatbot' )
				);
			}

			// At least one of the requested datasets has to exist.
			$datasets = Dataset::where( [ 'id' => $dataset_ids ] );
			if ( $datasets->is_empty() ) {
				throw new Exception(
					Error_Codes::NOT_FOUND,
					__( 'Datasets not found', 'limb-chatbot' )
				);
			}

			// Explicitly selected datasets keep their existing indexing setup,
			// so every one of them must already have it in its meta.
			foreach ( $datasets as $dataset ) {
				$indexing_ai_model_id = $dataset->get_meta_value( 'index_ai_model_id' );
				$indexing_config_id   = $dataset->get_meta_value( 'index_config_id' );
				$vector_index_id      = $dataset->get_meta_value( 'vector_index_id' );
				$dimension            = $dataset->get_meta_value( 'dimension' );

				if ( empty( $indexing_ai_model_id ) || empty( $indexing_config_id ) || empty( $vector_index_id ) || empty( $dimension ) ) {
					throw new Exception(
						Error_Codes::VALIDATION_INVALID_VALUE,
						sprintf(
							/* translators: %s: dataset ID. */
							__( 'Dataset %s is missing required indexing configuration.', 'limb-chatbot' ),
							$dataset->get_id()
						)
					);
				}
			}
		}

		// A chatbot UUID other than the default placeholder must point at a real chatbot.
		if ( ! empty( $chatbot_uuid ) && $chatbot_uuid !== Job::CHATBOT_DEFAULT ) {
			$chatbot = Chatbot::find_by_uuid( $chatbot_uuid );
			if ( empty( $chatbot ) ) {
				throw new Exception(
					Error_Codes::VALIDATION_INVALID_VALUE,
					__( 'The specified chatbot does not exist.', 'limb-chatbot' )
				);
			}
		}

		return true;
	}

	/**
	 * Get total number of datasets to regenerate.
	 *
	 * Calculates the total from the configuration alone, without fetching datasets.
	 * For "all" the real number is unknown here, so a placeholder of 1 is returned;
	 * generate_task_batch() replaces it with the actual count on the first batch.
	 *
	 * @param array $config Job configuration.
	 * @param Job $job Job instance (not used by this implementation).
	 *
	 * @return int Total task count (1 as a placeholder when dataset_ids is "all").
	 * @throws Exception If dataset_ids is empty or is neither "all" nor an array.
	 * @since 1.1.0
	 */
	public function get_total( array $config, Job $job ): int {
		$dataset_ids = $config['dataset_ids'] ?? null;

		if ( $dataset_ids === 'all' ) {
			// Placeholder - the actual count is written to the job stats in generate_task_batch().
			return 1;
		}

		if ( empty( $dataset_ids ) || ! is_array( $dataset_ids ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'There is no dataset to regenerate', 'limb-chatbot' )
			);
		}

		// The guard above guarantees a non-empty array, so the count is always >= 1.
		return count( $dataset_ids );
	}

	/**
	 * Generate a batch of tasks for dataset regeneration.
	 *
	 * Creates one task per dataset ID in the [$offset, $offset + $limit) window.
	 * Datasets in the window are set to PENDING status before their tasks are created.
	 * In the "all" case the dataset list is resolved from the job sub type, and on the
	 * first batch the placeholder total in the job stats is replaced by the real count.
	 *
	 * @param Job $job Job instance.
	 * @param array $config Job configuration.
	 * @param int $offset Starting offset for this batch.
	 * @param int $limit Maximum number of tasks to generate.
	 *
	 * @return int Number of tasks actually created.
	 * @since 1.1.0
	 */
	public function generate_task_batch( Job $job, array $config, int $offset, int $limit ): int {
		$dataset_ids = $config['dataset_ids'] ?? null;

		if ( $dataset_ids === 'all' ) {
			// The job sub type decides which kind of datasets "all" refers to.
			$dataset_type = $job->get_sub_type() === 'informational'
				? Dataset::TYPE_INFORMATIONAL_KNOWLEDGE
				: Dataset::TYPE_ACTIONABLE_KNOWLEDGE;

			$all_datasets    = Dataset::where( [ 'type' => $dataset_type ], - 1, - 1, 'id', 'ASC' );
			$all_dataset_ids = array_map(
				static fn( $dataset ) => $dataset->get_id(),
				$all_datasets->get()
			);

			// Replace the placeholder total (0 = never set, 1 = value from get_total())
			// with the real count. Only done on the first batch so a resumed job does
			// not overwrite it again. The cast keeps the strict comparison working
			// even if the stored stat comes back as a numeric string.
			$stats         = $job->get_stats() ?? [];
			$current_total = (int) ( $stats['total_tasks'] ?? 0 );
			$actual_total  = count( $all_dataset_ids );
			if ( $offset === 0 && $actual_total > 0 && ( $current_total === 0 || $current_total === 1 ) ) {
				$stats['total_tasks']        = $actual_total;
				$stats['parent_tasks_total'] = $actual_total;
				$job->set_stats( $stats );
				$job->save();
			}

			$batch_ids = array_slice( $all_dataset_ids, $offset, $limit );
		} else {
			if ( empty( $dataset_ids ) || ! is_array( $dataset_ids ) ) {
				return 0;
			}

			$batch_ids = array_slice( $dataset_ids, $offset, $limit );
		}

		if ( empty( $batch_ids ) ) {
			return 0;
		}

		// Set datasets to PENDING status before processing.
		$datasets = Dataset::where( [ 'id' => $batch_ids ] );
		foreach ( $datasets as $dataset ) {
			$dataset->set_status( Dataset::STATUS_PENDING );
			$dataset->save();
		}

		$task_count = 0;
		$job_id     = $job->get_id();

		foreach ( $batch_ids as $dataset_id ) {
			// One parent task per dataset; only successful creations are counted.
			// create_task() is not defined in this class (presumably inherited).
			if ( $this->create_task( $job_id, [ 'dataset_id' => (int) $dataset_id ] ) ) {
				$task_count ++;
			}
		}

		return $task_count;
	}

	/**
	 * Process a single task.
	 *
	 * A task with a parent task ID is a child task and is handed to the indexing service.
	 * A task without one is a parent task; this method does nothing for it because its
	 * work happens in generate_sub_tasks().
	 *
	 * @param Task $task Task to process.
	 *
	 * @return bool Result of the indexing call for child tasks, true for parent tasks.
	 * @throws Exception If processing fails.
	 * @since 1.1.0
	 */
	public function process_task( Task $task ): bool {
		if ( $task->get_parent_task_id() ) {
			// Child task - index the entry referenced by the task.
			$indexing_service = new Indexing_Service();

			return $indexing_service->index_entry( $task );
		}

		// Parent task - nothing to do here, sub-tasks are produced by generate_sub_tasks().
		return true;
	}

	/**
	 * Generate sub-tasks for a parent task.
	 *
	 * On the first run for a parent task (no child tasks exist yet) the dataset meta is
	 * updated with the sanitizing model/config from the job, and - when the job targets
	 * "all" datasets - with the indexing model/config as well. The dataset's entries are
	 * then cleared and regenerated. Afterwards one child (indexing) task is created for
	 * every dataset entry that does not already have one.
	 *
	 * @param Task $task Parent task.
	 *
	 * @return bool Always true in this implementation (all sub-tasks are created in one call;
	 *              invalid payloads, missing datasets and missing indexing config are skipped).
	 * @throws Exception If generation fails.
	 * @since 1.1.0
	 */
	public function generate_sub_tasks( Task $task ): bool {
		$payload    = $task->get_payload();
		$dataset_id = $payload['dataset_id'] ?? null;
		$job        = $task->job();

		if ( empty( $dataset_id ) ) {
			return true; // Invalid payload, skip
		}

		$dataset = Dataset::find( $dataset_id );

		if ( ! $dataset instanceof Dataset ) {
			return true; // Dataset not found, skip
		}

		$job_id          = $job->get_id();
		$children_filter = [
			'job_id'         => $job_id,
			'parent_task_id' => $task->get_id(),
		];

		// No child task yet means this parent is being handled for the first time.
		// job_task_repository is not declared in this class (presumably inherited).
		$is_first_batch = $this->job_task_repository
			->where( $children_filter, 1, 1, 'id', 'ASC' )
			->is_empty();

		if ( $is_first_batch ) {
			// Store the new sanitizing model/config on the dataset.
			$ai_model_id = $job->get_config_value( 'ai_model_id' );
			$config_id   = $job->get_config_value( 'config_id' );

			if ( ! empty( $ai_model_id ) ) {
				$dataset->update_meta( 'ai_model_id', $ai_model_id );
			}
			if ( ! empty( $config_id ) ) {
				$dataset->update_meta( 'config_id', $config_id );
			}

			// Only an "all" job also replaces the indexing model/config.
			if ( $job->get_config_value( 'dataset_ids' ) === 'all' ) {
				$indexing_ai_model_id = $job->get_config_value( 'indexing_ai_model_id' );
				$indexing_config_id   = $job->get_config_value( 'indexing_config_id' );

				if ( ! empty( $indexing_ai_model_id ) ) {
					$dataset->update_meta( 'index_ai_model_id', $indexing_ai_model_id );
				}
				if ( ! empty( $indexing_config_id ) ) {
					$dataset->update_meta( 'index_config_id', $indexing_config_id );
				}
			}

			// Clear existing entries for fresh regeneration.
			$this->dataset_service->clear( $dataset );

			// Clear any previous errors.
			$dataset->update_meta( 'errors', wp_json_encode( [] ) );

			// Regenerate the knowledge entries; the meta has to be updated before this call.
			$dataset = ( new Knowledge_Generator() )->generate( $dataset );

			// Store the ID of the dataset returned by the generator in the parent task payload.
			$task->set_payload( array_merge( $payload, [ 'dataset_id' => $dataset->get_id() ] ) );
			$task->save();
		}

		// Indexing configuration comes from the dataset meta (either updated above or pre-existing).
		$index_config_id   = $dataset->get_meta_value( 'index_config_id' );
		$index_ai_model_id = $dataset->get_meta_value( 'index_ai_model_id' );
		$vector_index_id   = $dataset->get_meta_value( 'vector_index_id' );
		$dimension         = $dataset->get_meta_value( 'dimension' );

		if ( empty( $index_config_id ) || empty( $index_ai_model_id ) || empty( $vector_index_id ) || empty( $dimension ) ) {
			return true; // No indexing config, nothing to do
		}

		// Get all dataset entries (no limit, like dataset_sync).
		$dataset_entries = Dataset_Entry::where( [ 'dataset_id' => $dataset->get_id() ], - 1, - 1, 'id', 'ASC' );

		// Entry IDs that already have a child task (resume scenario). On the first
		// batch we have just established that no child task exists, so the full
		// lookup is skipped and the set stays empty.
		$existing_entry_ids = [];
		if ( ! $is_first_batch ) {
			$existing_children = $this->job_task_repository->where( $children_filter, - 1, - 1, 'id', 'ASC' );

			foreach ( $existing_children as $child ) {
				$child_payload = $child->get_payload();
				if ( isset( $child_payload['dataset_entry_id'] ) ) {
					$existing_entry_ids[ $child_payload['dataset_entry_id'] ] = true;
				}
			}
		}

		// Create a child task for every entry that does not have one yet.
		foreach ( $dataset_entries as $entry ) {
			if ( ! $entry instanceof Dataset_Entry ) {
				continue;
			}

			$entry_id = $entry->get_id();

			// Skip if child task already exists (resume scenario).
			if ( isset( $existing_entry_ids[ $entry_id ] ) ) {
				continue;
			}

			// create_sub_task() is not defined in this class (presumably inherited).
			$this->create_sub_task( $job_id, $task, [
				'dataset_entry_id' => $entry_id,
				'config_id'        => $index_config_id,
				'ai_model_id'      => $index_ai_model_id,
				'vector_index_id'  => $vector_index_id,
				'dimension'        => $dimension,
			] );
		}

		return true;
	}

	/**
	 * Determine if an exception is critical.
	 *
	 * Delegates to the parent implementation without adding handler-specific rules.
	 *
	 * @param Exception $exception Exception that occurred.
	 *
	 * @return bool True if critical.
	 * @since 1.1.0
	 */
	public function is_critical_error( Exception $exception ): bool {
		return parent::is_critical_error( $exception );
	}
}
