<?php

namespace Limb_Chatbot\Includes\Services\Jobs\Handlers;

use Limb_Chatbot\Includes\Data_Objects\Chatbot;
use Limb_Chatbot\Includes\Data_Objects\Job;
use Limb_Chatbot\Includes\Data_Objects\Task;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Repositories\Job_Repository;
use Limb_Chatbot\Includes\Services\Job\Abstract_Job_Handler;

/**
 * Sitemap Scrape Job Handler
 *
 * Handles sitemap scraping jobs. Creates tasks for each URL discovered
 * (both sitemap URLs and page URLs). Each task processes one URL,
 * and sitemap URLs generate additional tasks for discovered URLs.
 * Updates chatbot parameter with the complete list of scraped page URLs.
 *
 * @since 1.1.0
 */
class Sitemap_Scrape extends Abstract_Job_Handler {

	/**
	 * Maximum sitemap nesting depth.
	 *
	 * A sitemap task whose payload depth has reached this value is logged and
	 * skipped instead of being fetched, which bounds how far nested (or
	 * mutually referencing) sitemap indexes are followed.
	 *
	 * @var int
	 * @since 1.1.0
	 */
	private int $max_depth = 10;

	/**
	 * Report which job type is handled by this class.
	 *
	 * @return string The sitemap-scrape job type constant defined on Job.
	 * @since 1.1.0
	 */
	public function get_job_type(): string {
		return Job::TYPE_SITEMAP_SCRAPE;
	}

	/**
	 * Validate job configuration.
	 *
	 * Requires a non-empty `sitemap_url` that is a well-formed http(s) URL and,
	 * when a concrete chatbot UUID is supplied, that the chatbot exists.
	 * The default-chatbot marker (Job::CHATBOT_DEFAULT) is treated as
	 * "no chatbot given" and skips the lookup.
	 *
	 * @param  array  $config  Job configuration; reads the `sitemap_url` key.
	 * @param  string|null  $chatbot_uuid  Chatbot UUID (optional).
	 *
	 * @return bool True if valid.
	 * @throws Exception With EMPTY_VALUE when the URL is missing, or
	 *                   VALIDATION_INVALID_VALUE when the URL is malformed,
	 *                   not http/https, or the chatbot cannot be found.
	 * @since 1.1.0
	 */
	public function validate( array $config, ?string $chatbot_uuid = null ): bool {
		$sitemap_url  = $config['sitemap_url'] ?? null;
		$chatbot_uuid = $chatbot_uuid === Job::CHATBOT_DEFAULT ? null : $chatbot_uuid;

		// Validate sitemap URL presence
		if ( empty( $sitemap_url ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'Sitemap URL is required.', 'limb-chatbot' )
			);
		}

		// Validate URL format. FILTER_VALIDATE_URL alone accepts any scheme
		// (file://, ftp://, ...), but sitemaps are only ever fetched over HTTP(S),
		// so anything else is rejected up front instead of failing later at fetch time.
		$is_valid_url = is_string( $sitemap_url ) && false !== filter_var( $sitemap_url, FILTER_VALIDATE_URL );
		if ( $is_valid_url ) {
			$scheme       = strtolower( (string) parse_url( $sitemap_url, PHP_URL_SCHEME ) );
			$is_valid_url = in_array( $scheme, [ 'http', 'https' ], true );
		}

		if ( ! $is_valid_url ) {
			throw new Exception(
				Error_Codes::VALIDATION_INVALID_VALUE,
				__( 'Invalid sitemap URL format.', 'limb-chatbot' )
			);
		}

		// Validate chatbot if provided
		if ( ! empty( $chatbot_uuid ) ) {
			$chatbot = Chatbot::find_by_uuid( $chatbot_uuid );
			if ( empty( $chatbot ) ) {
				throw new Exception(
					Error_Codes::VALIDATION_INVALID_VALUE,
					__( 'The specified chatbot does not exist.', 'limb-chatbot' )
				);
			}
		}

		return true;
	}

	/**
	 * Report the number of tasks known before any work has been done.
	 *
	 * URLs are only discovered while sitemaps are being processed, so the real
	 * total cannot be computed in advance. The value returned here accounts
	 * for the initial sitemap only; more tasks are added as URLs are found.
	 *
	 * @param  array  $config  Job configuration (not consulted).
	 * @param  Job  $job  Job instance (not consulted).
	 *
	 * @return int Always 1, standing for the initial sitemap task.
	 * @since 1.1.0
	 */
	public function get_total( array $config, Job $job ): int {
		// Only the root sitemap is known at this point.
		return 1;
	}

	/**
	 * Generate the initial batch of tasks for sitemap scraping.
	 *
	 * Only the first call (offset 0) produces tasks: the root sitemap is parsed
	 * and one task is created per distinct URL found (nested sitemaps and pages).
	 * If the root sitemap cannot be parsed, a single sitemap task for the root
	 * URL is created instead so that it is attempted again during processing.
	 * Further tasks are created later by process_task() as nested sitemaps are
	 * handled, so any call with a positive offset returns 0.
	 *
	 * The job config's `collected_urls` and `visited_sitemaps` lists are reset
	 * here. When the root sitemap was parsed successfully it is recorded as
	 * visited, so an index that lists itself is not fetched and expanded again.
	 *
	 * @param  Job  $job  Job instance.
	 * @param  array  $config  Job configuration; reads the `sitemap_url` key.
	 * @param  int  $offset  Starting offset for this batch.
	 * @param  int  $limit  Batch size hint. Ignored: every URL in the root
	 *                      sitemap gets a task because the total is unknown upfront.
	 *
	 * @return int Number of tasks actually created.
	 * @since 1.1.0
	 */
	public function generate_task_batch( Job $job, array $config, int $offset, int $limit ): int {
		$sitemap_url = $config['sitemap_url'] ?? null;

		if ( empty( $sitemap_url ) || $offset > 0 ) {
			return 0;
		}

		// Parse the root sitemap first so we know whether it can be marked as visited.
		$urls        = $this->parse_sitemap( $sitemap_url );
		$root_parsed = ! empty( $urls );

		if ( ! $root_parsed ) {
			// Parsing failed or produced nothing: queue the root itself as a
			// sitemap task so it is attempted once more during processing.
			$urls = [
				[
					'url'  => $sitemap_url,
					'type' => 'sitemap',
				],
			];
		}

		// Reset the tracking lists and persist them before any task exists, so
		// tasks never observe stale state. The root is only marked visited when
		// it was actually parsed; otherwise its retry task would be skipped.
		$config['collected_urls']   = [];
		$config['visited_sitemaps'] = $root_parsed ? [ $sitemap_url ] : [];
		$job->set_config( $config );
		$job->save();

		$task_count = 0;
		$queued     = [];
		foreach ( $urls as $url_data ) {
			// A sitemap may list the same location twice; one task is enough.
			$key = $url_data['type'] . '|' . $url_data['url'];
			if ( isset( $queued[ $key ] ) ) {
				continue;
			}
			$queued[ $key ] = true;

			$payload = [
				'url'   => $url_data['url'],
				'type'  => $url_data['type'],
				'depth' => 0,
			];

			if ( $this->create_task( $job->get_id(), $payload ) ) {
				$task_count++;
			}
		}

		// Record how many tasks were generated so far.
		if ( $task_count > 0 ) {
			$stats                = $job->get_stats() ?? [];
			$stats['total_tasks'] = $task_count;
			$job->set_stats( $stats );
			$job->save();
		}

		return $task_count;
	}

	/**
	 * Process a single task (process one URL).
	 *
	 * Page tasks add their URL to the job's `collected_urls` list and to the
	 * chatbot's `sitemap_urls` parameter, unless the URL was already collected.
	 * Sitemap tasks are skipped when the depth limit is reached or the sitemap
	 * was already visited; otherwise the sitemap is marked visited, parsed, and
	 * a follow-up task (depth + 1) is created for every discovered URL that is
	 * not already known to be visited or collected.
	 *
	 * NOTE(review): the `total_tasks` stat is not adjusted here for tasks created
	 * from nested sitemaps — confirm the job framework accounts for them.
	 *
	 * @param  Task  $task  Task to process; its payload carries `url`, `type` and `depth`.
	 *
	 * @return bool True on success (including skipped tasks), false when the
	 *              payload has no URL or the owning job cannot be resolved.
	 * @throws Exception If processing fails.
	 * @since 1.1.0
	 */
	public function process_task( Task $task ): bool {
		$payload = $task->get_payload();
		$url     = $payload['url'] ?? null;
		$type    = $payload['type'] ?? 'page';
		// Normalise depth so the limit comparison is always numeric.
		$depth   = (int) ( $payload['depth'] ?? 0 );

		if ( empty( $url ) ) {
			return false;
		}

		// Get job and config
		$job = $task->job();
		if ( ! $job ) {
			return false;
		}

		$config           = $job->get_config();
		$collected_urls   = $config['collected_urls'] ?? [];
		$visited_sitemaps = $config['visited_sitemaps'] ?? [];

		if ( $type !== 'sitemap' ) {
			// Page URL: nothing to do (and nothing to persist) if already collected.
			if ( in_array( $url, $collected_urls, true ) ) {
				return true;
			}

			$collected_urls[] = $url;

			// Update chatbot parameter incrementally
			$this->update_chatbot_parameter( $job, $url );

			$config['collected_urls'] = $collected_urls;
			$job->set_config( $config );
			$job->save();

			return true;
		}

		// Check depth limit
		if ( $depth >= $this->max_depth ) {
			$this->log( sprintf( 'Maximum depth reached for sitemap: %s', $url ) );
			return true;
		}

		// Check if already visited
		if ( in_array( $url, $visited_sitemaps, true ) ) {
			$this->log( sprintf( 'Already visited sitemap: %s', $url ) );
			return true;
		}

		// Mark as visited before fetching, so the sitemap is not expanded twice.
		$visited_sitemaps[]         = $url;
		$config['visited_sitemaps'] = $visited_sitemaps;
		$job->set_config( $config );
		$job->save();

		// Lookup tables keep the "already known" checks cheap for large sitemaps.
		$known = [
			'sitemap' => array_flip( $visited_sitemaps ),
			'page'    => array_flip( $collected_urls ),
		];
		$queued = [];

		// Parse sitemap and create tasks for discovered URLs
		foreach ( $this->parse_sitemap( $url ) as $url_data ) {
			$found_url  = $url_data['url'];
			$found_type = $url_data['type'];
			$key        = $found_type . '|' . $found_url;

			// Skip repeats within this sitemap and URLs whose task would be a no-op.
			if ( isset( $queued[ $key ] ) || isset( $known[ $found_type ][ $found_url ] ) ) {
				continue;
			}
			$queued[ $key ] = true;

			$this->create_task(
				$job->get_id(),
				[
					'url'   => $found_url,
					'type'  => $found_type,
					'depth' => $depth + 1,
				]
			);
		}

		return true;
	}

	/**
	 * Append one URL to the chatbot's `sitemap_urls` parameter.
	 *
	 * Resolves the chatbot from the job (the default marker resolves through
	 * Chatbot::make()), reads the current `sitemap_urls` list, and writes it
	 * back with the URL added. Nothing is written when the URL is already in
	 * the list or when a specific chatbot UUID cannot be found.
	 *
	 * @param  Job  $job  Job instance supplying the chatbot UUID.
	 * @param  string  $url  URL to add.
	 *
	 * @return void
	 * @since 1.1.0
	 */
	private function update_chatbot_parameter( Job $job, string $url ): void {
		$uuid = $job->get_chatbot_uuid();
		if ( $uuid === Job::CHATBOT_DEFAULT ) {
			$uuid = null;
		}

		// Resolve the target chatbot: explicit UUID lookup, or the default instance.
		if ( empty( $uuid ) ) {
			$chatbot = Chatbot::make();
		} else {
			$chatbot = Chatbot::find_by_uuid( $uuid );
			if ( empty( $chatbot ) ) {
				$this->log( sprintf( 'Chatbot not found for parameter update. UUID: %s', $uuid ) );
				return;
			}
		}

		// Anything that is not an array is treated as "no URLs stored yet".
		$stored_urls = $chatbot->get_parameter( 'sitemap_urls' );
		$stored_urls = is_array( $stored_urls ) ? $stored_urls : [];

		// Already recorded: leave the parameter untouched.
		if ( in_array( $url, $stored_urls, true ) ) {
			return;
		}

		$stored_urls[] = $url;
		// Drop any pre-existing duplicates and re-index before writing back.
		$stored_urls = array_values( array_unique( $stored_urls ) );

		$chatbot->update_parameter( 'sitemap_urls', $stored_urls );
		$this->log( sprintf( 'Updated chatbot parameter with URL: %s (total: %d)', $url, count( $stored_urls ) ) );
	}

	/**
	 * Parse a single sitemap and return discovered URLs.
	 *
	 * Fetches one sitemap URL and returns the distinct locations it lists.
	 * If the document contains any `<sitemap><loc>` entries it is treated as a
	 * sitemap index and only those are returned (type `sitemap`); otherwise
	 * its `<url><loc>` entries are returned (type `page`). Does not recurse -
	 * that is handled by creating new tasks.
	 *
	 * Any failure (transport error, non-200 status, empty body, undecodable
	 * gzip data, invalid XML, no recognised entries) is logged and yields an
	 * empty array.
	 *
	 * TLS certificates are verified using the WordPress defaults. Sites with
	 * self-signed certificates can opt out through the core `https_ssl_verify`
	 * / `https_local_ssl_verify` filters.
	 *
	 * NOTE(review): the URL originates from job config; consider
	 * wp_safe_remote_get() if requests to internal hosts must be blocked.
	 *
	 * @param  string  $sitemap_url  URL of the sitemap to parse.
	 *
	 * @return array Array of URL data with 'url' and 'type' keys.
	 * @since 1.1.0
	 */
	private function parse_sitemap( string $sitemap_url ): array {
		// Collect libxml errors locally, remembering the previous setting so the
		// process-wide flag can be restored whatever path is taken below.
		$previous_libxml_setting = libxml_use_internal_errors( true );

		try {
			// Fetch sitemap content
			$response = wp_remote_get( $sitemap_url, [
				'timeout' => 30,
			] );

			if ( is_wp_error( $response ) ) {
				$this->log( sprintf( 'Error fetching sitemap %s: %s', $sitemap_url, $response->get_error_message() ) );
				return [];
			}

			$status_code = wp_remote_retrieve_response_code( $response );
			if ( $status_code !== 200 ) {
				$this->log( sprintf( 'Non-200 status code for sitemap %s: %d', $sitemap_url, $status_code ) );
				return [];
			}

			$body = wp_remote_retrieve_body( $response );
			if ( empty( $body ) ) {
				$this->log( sprintf( 'Empty response body for sitemap: %s', $sitemap_url ) );
				return [];
			}

			// Sitemaps are frequently published as .xml.gz files, which arrive as raw
			// gzip data (magic bytes 1F 8B) rather than transfer-encoded content.
			if ( 0 === strncmp( $body, "\x1f\x8b", 2 ) && function_exists( 'gzdecode' ) ) {
				$decoded = gzdecode( $body );
				if ( false === $decoded || '' === $decoded ) {
					$this->log( sprintf( 'Failed to decompress gzipped sitemap: %s', $sitemap_url ) );
					return [];
				}
				$body = $decoded;
			}

			// Parse XML; LIBXML_NONET stops the parser from fetching network resources.
			$xml = simplexml_load_string( $body, \SimpleXMLElement::class, LIBXML_NONET );

			if ( $xml === false ) {
				$error_messages = array_map( function( $error ) {
					return trim( $error->message );
				}, libxml_get_errors() );
				$this->log( sprintf( 'Failed to parse XML for sitemap %s: %s', $sitemap_url, implode( '; ', $error_messages ) ) );
				return [];
			}

			// Nested sitemap entries take precedence: their presence marks a sitemap index.
			$type      = 'sitemap';
			$locations = $this->find_sitemap_loc_elements( $xml, 'sitemap' );

			if ( empty( $locations ) ) {
				// Regular sitemap - extract page URLs
				$type      = 'page';
				$locations = $this->find_sitemap_loc_elements( $xml, 'url' );
			}

			if ( empty( $locations ) ) {
				// Log for debugging - describe what the XML structure actually is
				$child_names = [];
				foreach ( $xml->children() as $child ) {
					$child_names[] = $child->getName();
				}
				$this->log( sprintf(
					'No sitemap or URL elements found in: %s. Root: %s, Children: %s, Namespaces: %s, Body preview: %s',
					$sitemap_url,
					$xml->getName(),
					implode( ', ', array_unique( $child_names ) ),
					implode( ', ', array_keys( $xml->getNamespaces( true ) ) ),
					substr( $body, 0, 500 )
				) );
				return [];
			}

			$urls = [];
			$seen = [];
			foreach ( $locations as $location ) {
				$url = trim( (string) $location );
				// Skip blank entries and locations already listed in this document.
				if ( empty( $url ) || isset( $seen[ $url ] ) ) {
					continue;
				}
				$seen[ $url ] = true;
				$urls[]       = [
					'url'  => $url,
					'type' => $type,
				];
			}

			return $urls;

		} catch ( \Exception $e ) {
			$this->log( sprintf( 'Exception while parsing sitemap %s: %s', $sitemap_url, $e->getMessage() ) );
			return [];
		} finally {
			// Do not leak buffered errors or the error-handling mode to other code.
			libxml_clear_errors();
			libxml_use_internal_errors( $previous_libxml_setting );
		}
	}

	/**
	 * Find the `<loc>` elements that sit directly under elements named $parent_name.
	 *
	 * Matches by local name so the document's namespace (or lack of one) does
	 * not matter. If that finds nothing, falls back to scanning the root's
	 * direct children whose name contains $parent_name (case-insensitive).
	 *
	 * @param  \SimpleXMLElement  $xml  Parsed sitemap document.
	 * @param  string  $parent_name  Parent element name: 'sitemap' or 'url'.
	 *
	 * @return array List of `<loc>` elements (possibly empty).
	 * @since 1.1.0
	 */
	private function find_sitemap_loc_elements( \SimpleXMLElement $xml, string $parent_name ): array {
		$found = $xml->xpath( sprintf( '//*[local-name()="%s"]/*[local-name()="loc"]', $parent_name ) );
		if ( ! empty( $found ) ) {
			return $found;
		}

		// Fallback: direct traversal if XPath yields nothing (or fails).
		$found = [];
		foreach ( $xml->children() as $child ) {
			if ( stripos( $child->getName(), $parent_name ) === false ) {
				continue;
			}

			$loc = $child->children( 'http://www.sitemaps.org/schemas/sitemap/0.9' )->loc;
			if ( empty( $loc ) ) {
				$loc = $child->loc;
			}
			if ( ! empty( $loc ) ) {
				$found[] = $loc;
			}
		}

		return $found;
	}

	/**
	 * Handle job completion.
	 *
	 * Re-reads the job through the repository, resolves its chatbot, and logs
	 * how many URLs ended up in the job config versus the chatbot's
	 * `sitemap_urls` parameter. Purely informational: nothing is modified.
	 *
	 * @param  Job  $job  Completed job instance.
	 *
	 * @return void
	 * @since 1.1.0
	 */
	public function complete( Job $job ): void {
		// Work from a freshly loaded copy so the config reflects every processed task.
		$reloaded = ( new Job_Repository() )->find( $job->get_id() );
		if ( ! $reloaded ) {
			$this->log( 'Job not found during completion - may have been deleted.' );
			return;
		}

		$job_config      = $reloaded->get_config();
		$collected_total = count( $job_config['collected_urls'] ?? [] );

		$uuid = $reloaded->get_chatbot_uuid();
		if ( $uuid === Job::CHATBOT_DEFAULT ) {
			$uuid = null;
		}

		// Resolve the chatbot whose parameter should have been filled in.
		if ( empty( $uuid ) ) {
			$chatbot = Chatbot::make();
		} else {
			$chatbot = Chatbot::find_by_uuid( $uuid );
			if ( empty( $chatbot ) ) {
				$this->log( sprintf( 'Chatbot not found during completion. UUID: %s', $uuid ) );
				return;
			}
		}

		// A missing or non-array parameter counts as zero stored URLs.
		$stored_urls  = $chatbot->get_parameter( 'sitemap_urls' );
		$stored_total = 0;
		if ( is_array( $stored_urls ) ) {
			$stored_total = count( $stored_urls );
		}

		$this->log( sprintf(
			'Job completed. Collected %d URLs in config, %d URLs in chatbot parameter.',
			$collected_total,
			$stored_total
		) );
	}

	/**
	 * Decide whether an exception should be treated as critical.
	 *
	 * This handler adds no rules of its own and defers entirely to the
	 * parent handler's decision.
	 *
	 * @param  Exception  $exception  Exception that occurred.
	 *
	 * @return bool True if critical.
	 * @since 1.1.0
	 */
	public function is_critical_error( Exception $exception ): bool {
		$is_critical = parent::is_critical_error( $exception );

		return $is_critical;
	}
}
