<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Content_Extractors;

use Limb_Chatbot\Includes\Data_Objects\File;
use Limb_Chatbot\Includes\Exceptions\Error_Codes;
use Limb_Chatbot\Includes\Exceptions\Exception;
use Limb_Chatbot\Includes\Services\Helper;
use Smalot\PdfParser\Parser;

/**
 * PDF Content Extractor for extracting text from PDF files.
 *
 * Extracts and normalizes PDF content using the pdfparser library:
 * - Extracts text from all pages
 * - Converts to simple HTML paragraphs (no artificial headings)
 * - Content is parsed as-is and will be chunked semantically later
 *
 * @since 1.0.0
 */
class PDF_Content_Extractor {

	/**
	 * Extract and normalize content from a PDF file.
	 *
	 * Validates the MIME type, resolves the file path, makes sure the pdfparser
	 * library is available, extracts the text of the document and converts it
	 * into simple `<p>` paragraphs.
	 *
	 * @param  File  $file  The File object containing PDF metadata.
	 *
	 * @return string Normalized HTML content ready for chunking.
	 * @throws Exception If file not found, not a PDF, parsing fails, or the PDF has no text.
	 * @since 1.0.0
	 */
	public function extract( File $file ): string {
		// Only PDFs are supported by this extractor.
		if ( $file->get_mime_type() !== 'application/pdf' ) {
			throw new Exception(
				Error_Codes::VALIDATION_INVALID_VALUE,
				__( 'File must be a PDF (application/pdf)', 'limb-chatbot' )
			);
		}

		// Resolve the stored path through the uploads helper.
		$file_path = Helper::get_wp_uploaded_file_dir( $file->get_file_path() );

		if ( ! $file_path || ! file_exists( $file_path ) ) {
			throw new Exception(
				Error_Codes::FILE_NOT_FOUND,
				// `?:` (not `??`) so that false / empty-string paths are also reported as "unknown".
				sprintf( __( 'PDF file not found at path: %s', 'limb-chatbot' ), $file_path ?: 'unknown' )
			);
		}

		$this->ensure_parser_loaded();

		// Parse PDF using pdfparser library.
		// \Throwable (not just \Exception) is caught so that engine errors - e.g. the
		// Parser class being unavailable, or a TypeError inside the library - are
		// reported as a regular plugin exception instead of a fatal error.
		try {
			$parser = new Parser();
			$pdf    = $parser->parseFile( $file_path );
			$text   = (string) $pdf->getText();
		} catch ( \Throwable $e ) {
			throw new Exception(
				Error_Codes::TECHNICAL_ERROR,
				sprintf( __( 'Failed to parse PDF: %s', 'limb-chatbot' ), $e->getMessage() ),
				[ 'original_exception' => $e->getMessage() ]
			);
		}

		// Explicit comparison instead of empty(): empty( '0' ) is true, which would
		// reject a document whose whole text is "0", while whitespace-only text
		// would slip through and produce an empty result.
		if ( '' === trim( $text ) ) {
			throw new Exception(
				Error_Codes::EMPTY_VALUE,
				__( 'No text content found in PDF', 'limb-chatbot' )
			);
		}

		// Normalize the extracted text to HTML-like format for chunking.
		return $this->normalize( $text );
	}

	/**
	 * Load the bundled pdfparser autoloader when the Parser class is not declared yet.
	 *
	 * The class is checked without triggering autoloading, so the bundled copy is only
	 * required when no other code has already declared the class. A missing autoloader
	 * file is not an error here; the subsequent parse attempt reports the failure.
	 *
	 * @return void
	 * @since 1.0.0
	 */
	private function ensure_parser_loaded(): void {
		if ( class_exists( 'Smalot\PdfParser\Parser', false ) ) {
			return;
		}

		// Plugin root: taken from the main plugin file constant when defined,
		// otherwise derived by walking up four levels from this directory.
		$plugin_dir = defined( 'LIMB_CHATBOT_FILE' )
			? plugin_dir_path( LIMB_CHATBOT_FILE )
			: dirname( dirname( dirname( dirname( __DIR__ ) ) ) ) . '/';

		$autoloader_path = $plugin_dir . 'includes/lib/pdfparser/autoload.php';
		if ( file_exists( $autoloader_path ) ) {
			require_once $autoloader_path;
		}
	}

	/**
	 * Normalize extracted PDF text to HTML-like format.
	 *
	 * Converts plain text to simple HTML paragraphs, preserving the original line structure.
	 * Each non-empty line becomes an escaped `<p>` paragraph; blank lines are dropped.
	 *
	 * @param  string  $text  Raw text extracted from PDF.
	 *
	 * @return string Normalized HTML content with one paragraph per non-empty line.
	 * @since 1.0.0
	 */
	private function normalize( string $text ): string {
		// Unify line endings (CRLF first, then lone CR). str_replace cannot fail,
		// unlike preg_replace which may return null.
		$text = str_replace( [ "\r\n", "\r" ], "\n", trim( $text ) );

		$html_blocks = [];

		foreach ( explode( "\n", $text ) as $line ) {
			$line = trim( $line );

			// Strict comparison: empty( '0' ) is true and would drop a line containing just "0".
			if ( '' === $line ) {
				continue;
			}

			// ENT_SUBSTITUTE replaces invalid UTF-8 sequences with U+FFFD; without it
			// htmlspecialchars() returns an empty string and the whole line is lost.
			$html_blocks[] = '<p>'
				. htmlspecialchars( $line, ENT_QUOTES | ENT_SUBSTITUTE | ENT_HTML5, 'UTF-8' )
				. '</p>';
		}

		// Join blocks with newlines.
		$html = implode( "\n", $html_blocks );

		// Clean up structure. trim() does not strip form feeds, so a line holding only
		// "\f" still yields a whitespace-only paragraph that is removed here.
		// preg_replace() returns null on failure; keep the previous value in that case.
		$html = preg_replace( '/<p>\s*<\/p>/', '', $html ) ?? $html;
		$html = preg_replace( '/\n{3,}/', "\n\n", $html ) ?? $html;

		return trim( $html );
	}
}
