<?php

namespace Limb_Chatbot\Includes\Services\Knowledge\Stringifiers;

use Limb_Chatbot\Includes\Interfaces\Chunk_Stringifier_Interface;

/**
 * Professional Chunk Stringifier (Enterprise RAG Optimized)
 *
 * Provides three distinct stringification strategies:
 * 1. For vector embedding – maximizes semantic signal with structured context
 * 2. For LLM inference – human-readable, hierarchical, source-attributed
 * 3. For admin display – comprehensive debugging and audit view
 *
 * Follows Chatbase-inspired formatting with enhanced metadata integration.
 *
 * @since 1.3.0
 */
class Chunk_Stringifier implements Chunk_Stringifier_Interface {

	/**
	 * Horizontal rule framing the admin display output.
	 */
	private const DISPLAY_RULE = '========================================';

	/**
	 * Separator placed between hierarchy levels in the inference heading line.
	 */
	private const PATH_SEPARATOR = ' > ';

	/**
	 * Line prefixes of the context block appended by stringify_for_embedding().
	 * Lines starting with one of these are stripped from content before embedding.
	 */
	private const METADATA_PREFIXES = [ 'Section:', 'Source:', 'Type:' ];

	/**
	 * Stringify chunk for vector embedding.
	 *
	 * Goal: Maximize semantic retrieval accuracy by prioritizing content while
	 * strategically injecting hierarchical and source context.
	 *
	 * Format:
	 * [Overlap start + primary content + overlap end]
	 *
	 * Section: <heading_path, or the post title when no heading path exists>
	 * Source: https://example.com/page
	 * Type: <semantic type, omitted for "paragraph">
	 *
	 * @param  array  $chunk  Chunk data from Enhanced_Heading_Aware_Chunker
	 *
	 * @return string Optimized for embedding models
	 */
	public function stringify_for_embedding( array $chunk ): string {
		$parts = [];

		// 1. Primary content (with overlaps), stripped of any baked-in context lines.
		$content = $this->clean_content_for_embedding(
			$this->build_content_with_overlap(
				$this->to_text( $chunk['content'] ?? null ),
				$this->chunk_text( $chunk, 'overlap_start' ),
				$this->chunk_text( $chunk, 'overlap_end' )
			)
		);
		if ( '' !== $content ) {
			$parts[] = $content;
		}

		// 2. Structured context block.
		$context_lines = [];

		$heading_path = $this->to_text( $chunk['heading_path'] ?? null );
		$post_title   = $this->chunk_text( $chunk, 'post_title', true );
		if ( '' !== $heading_path ) {
			$context_lines[] = "Section: {$heading_path}";
		} elseif ( '' !== $post_title ) {
			$context_lines[] = "Section: {$post_title}";
		}

		$source_url = $this->chunk_text( $chunk, 'source_url', true );
		if ( '' !== $source_url ) {
			$context_lines[] = "Source: {$source_url}";
		}

		// "paragraph" is the default type and carries no extra signal, so it is omitted.
		$semantic_type = $this->to_text( $chunk['metadata']['semantic_type'] ?? null );
		if ( '' !== $semantic_type && 'paragraph' !== $semantic_type ) {
			$context_lines[] = 'Type: ' . ucfirst( $semantic_type );
		}

		if ( [] !== $context_lines ) {
			$parts[] = implode( "\n", $context_lines );
		}

		return $this->normalize_for_embedding( implode( "\n\n", $parts ) );
	}

	/**
	 * Build content with overlap prepended and appended.
	 *
	 * Each piece is trimmed; pieces that are empty after trimming are skipped.
	 * A literal "0" is kept (it is text, not "no value").
	 *
	 * @param  string  $content  Main content.
	 * @param  string|null  $overlap_start  Overlap from previous chunk.
	 * @param  string|null  $overlap_end  Overlap for next chunk.
	 *
	 * @return string Pieces joined by a blank line.
	 * @since 1.2.1
	 */
	private function build_content_with_overlap( string $content, ?string $overlap_start = null, ?string $overlap_end = null ): string {
		$pieces = [];

		foreach ( [ $overlap_start, $content, $overlap_end ] as $piece ) {
			$piece = trim( (string) $piece );
			if ( '' !== $piece ) {
				$pieces[] = $piece;
			}
		}

		return implode( "\n\n", $pieces );
	}

	/**
	 * Strip context lines (Section:/Source:/Type:) that were accidentally baked into content.
	 *
	 * IMPORTANT: Preserves headings in content - they are part of the content, not metadata.
	 * Overlap text is NOT removed here; it is deliberately part of the embedded text.
	 *
	 * @param  string  $content  Content, possibly including overlap text.
	 *
	 * @return string Trimmed content without context lines.
	 */
	private function clean_content_for_embedding( string $content ): string {
		$content = trim( $content );

		// Fast path: nothing to strip unless some line starts (unindented) with a metadata prefix.
		if ( 1 !== preg_match( '/^(?:Section|Source|Type):/m', $content ) ) {
			return $content;
		}

		$kept           = [];
		$after_metadata = false;

		foreach ( explode( "\n", $content ) as $line ) {
			$trimmed = trim( $line );

			// Once the guard above fired, indented metadata lines are dropped as well.
			if ( $this->is_metadata_line( $trimmed ) ) {
				$after_metadata = true;
				continue;
			}

			// Swallow only the single blank separator that directly follows a metadata
			// block. The flag is reset on every non-metadata line so that a later
			// paragraph break inside real content is never removed.
			$is_block_separator = $after_metadata && '' === $trimmed;
			$after_metadata     = false;
			if ( $is_block_separator ) {
				continue;
			}

			$kept[] = $line;
		}

		return trim( implode( "\n", $kept ) );
	}

	/**
	 * Whether an already-trimmed line is one of the generated context lines.
	 *
	 * @param  string  $trimmed_line  Line with surrounding whitespace removed.
	 *
	 * @return bool
	 */
	private function is_metadata_line( string $trimmed_line ): bool {
		foreach ( self::METADATA_PREFIXES as $prefix ) {
			if ( str_starts_with( $trimmed_line, $prefix ) ) {
				return true;
			}
		}

		return false;
	}

	/**
	 * Final normalization for embedding – consistent, clean text.
	 *
	 * Decodes HTML entities, unifies line endings, maps typographic quotes, dashes
	 * and the ellipsis character to ASCII, and collapses redundant whitespace while
	 * keeping paragraph breaks (at most one blank line in a row).
	 *
	 * @param  string  $text  Text to normalize.
	 *
	 * @return string Normalized text; never emptied because of invalid UTF-8 input.
	 */
	private function normalize_for_embedding( string $text ): string {
		$text = html_entity_decode( $text, ENT_QUOTES | ENT_HTML5, 'UTF-8' );

		// Unify line endings so the newline rules below also apply to CRLF/CR input.
		$text = str_replace( [ "\r\n", "\r" ], "\n", $text );

		// Byte-wise replacement: unlike a /u regex this cannot fail (and wipe the
		// text) when the input contains invalid UTF-8 sequences.
		$text = strtr(
			$text,
			[
				"\u{201C}" => '"',   // left double quote
				"\u{201D}" => '"',   // right double quote
				"\u{2018}" => "'",   // left single quote
				"\u{2019}" => "'",   // right single quote
				"\u{2013}" => '-',   // en dash
				"\u{2014}" => '-',   // em dash
				"\u{2026}" => '...', // horizontal ellipsis
			]
		);

		// preg_replace() returns null on error; keep the previous value in that case.
		$text = preg_replace( '/\.{3,}/', '...', $text ) ?? $text;

		// Collapse horizontal whitespace, then drop trailing whitespace on each line
		// BEFORE collapsing newlines, so whitespace-only lines count as blank lines.
		$text = preg_replace( '/[ \t]+/', ' ', $text ) ?? $text;
		$text = preg_replace( '/[ \t]+\n/', "\n", $text ) ?? $text;
		$text = preg_replace( '/\n{3,}/', "\n\n", $text ) ?? $text;

		return trim( $text );
	}

	/**
	 * Stringify chunk for LLM inference (injected into system/user prompt).
	 *
	 * Goal: Provide clear, natural, hierarchical context with source attribution.
	 * Designed to feel like a well-structured knowledge excerpt.
	 *
	 * Output: "### <hierarchy>", the content (with overlaps) and a
	 * "[Source: … | Chunk #…]" footer, separated by blank lines; missing parts are omitted.
	 *
	 * @param  array  $chunk
	 *
	 * @return string Clean, readable context for LLM
	 */
	public function stringify_for_inference( array $chunk ): string {
		$parts = [];

		// Build full hierarchical title.
		$post_title      = $this->chunk_text( $chunk, 'post_title', true );
		$heading_path    = $this->to_text( $chunk['heading_path'] ?? null );
		$current_heading = $this->to_text( $chunk['heading'] ?? $chunk['title'] ?? null );

		// Assumes heading_path levels are joined with PATH_SEPARATOR – TODO confirm against the chunker.
		$path_includes_title = '' !== $post_title
			&& str_starts_with( $heading_path, $post_title . self::PATH_SEPARATOR );

		$full_path = [];
		if ( '' !== $post_title && ! $path_includes_title ) {
			$full_path[] = $post_title;
		}
		if ( '' !== $heading_path && $heading_path !== $post_title ) {
			$full_path[] = $heading_path;
		} elseif ( '' !== $current_heading && $current_heading !== $post_title ) {
			// Skip a heading equal to the title to avoid "Title > Title".
			$full_path[] = $current_heading;
		}

		if ( [] !== $full_path ) {
			$parts[] = '### ' . implode( self::PATH_SEPARATOR, $full_path );
		}

		// Main content – formatting is preserved; overlaps are included for context.
		$content = $this->build_content_with_overlap(
			$this->to_text( $chunk['content'] ?? null ),
			$this->chunk_text( $chunk, 'overlap_start' ),
			$this->chunk_text( $chunk, 'overlap_end' )
		);
		if ( '' !== $content ) {
			$parts[] = $content;
		}

		// Source and metadata footer.
		$footer     = [];
		$source_url = $this->chunk_text( $chunk, 'source_url', true );
		if ( '' !== $source_url ) {
			$footer[] = "Source: {$source_url}";
		}

		// Index 0 is valid, so test against the empty string rather than falsiness.
		$chunk_index = $this->chunk_text( $chunk, 'chunk_index' );
		if ( '' !== $chunk_index ) {
			$footer[] = "Chunk #{$chunk_index}";
		}

		if ( [] !== $footer ) {
			$parts[] = '[' . implode( ' | ', $footer ) . ']';
		}

		return implode( "\n\n", $parts );
	}

	/**
	 * Stringify chunk for admin panel display (CRUD, debugging, auditing).
	 *
	 * Goal: Maximum transparency and traceability.
	 *
	 * Overlaps are read from the chunk first and from its metadata second, the
	 * same lookup the embedding and inference variants use.
	 *
	 * @param  array  $chunk
	 *
	 * @return string Rich formatted display
	 */
	public function stringify_for_display( array $chunk ): string {
		// Header
		$title = $this->to_text( $chunk['title'] ?? $chunk['heading'] ?? null );
		if ( '' === $title ) {
			$title = 'Untitled Chunk';
		}

		$lines = [
			self::DISPLAY_RULE,
			"CHUNK: {$title}",
			self::DISPLAY_RULE,
		];

		// Hierarchy
		$heading_path = $this->to_text( $chunk['heading_path'] ?? null );
		if ( '' !== $heading_path ) {
			$lines[] = "Full Path: {$heading_path}";
		}

		$post_title = $this->chunk_text( $chunk, 'post_title', true );
		if ( '' !== $post_title && $post_title !== $heading_path ) {
			$lines[] = "Document: {$post_title}";
		}

		// Stats
		$tokens = $this->to_text( $chunk['tokens'] ?? null );
		if ( '' !== $tokens ) {
			$lines[] = "Tokens: ~{$tokens}";
		}

		$chunk_index = $this->chunk_text( $chunk, 'chunk_index' );
		if ( '' !== $chunk_index ) {
			$lines[] = "Chunk Index: {$chunk_index}";
		}

		$semantic_type = $this->to_text( $chunk['metadata']['semantic_type'] ?? null );
		$lines[]       = 'Semantic Type: ' . ucfirst( '' === $semantic_type ? 'paragraph' : $semantic_type );

		// Source
		$source_url = $this->chunk_text( $chunk, 'source_url', true );
		if ( '' !== $source_url ) {
			$lines[] = "Source URL: {$source_url}";
		}

		// Content (shown verbatim, without overlaps)
		$content = $this->to_text( $chunk['content'] ?? null );
		$lines[] = '';
		$lines[] = '--- Content ---';
		$lines[] = '' === $content ? '[Empty]' : $content;

		// Overlap indicators (if present)
		$overlap_sections = [
			'--- Overlap Start ---' => $this->chunk_text( $chunk, 'overlap_start' ),
			'--- Overlap End ---'   => $this->chunk_text( $chunk, 'overlap_end' ),
		];
		foreach ( $overlap_sections as $label => $overlap ) {
			if ( '' !== $overlap ) {
				$lines[] = '';
				$lines[] = $label;
				$lines[] = $overlap;
			}
		}

		// Full metadata dump
		$metadata = $chunk['metadata'] ?? [];
		if ( is_array( $metadata ) && [] !== $metadata ) {
			$lines[] = '';
			$lines[] = '--- Raw Metadata ---';
			foreach ( $metadata as $key => $value ) {
				$lines[] = "{$key}: " . $this->format_display_value( $value );
			}
		}

		$lines[] = self::DISPLAY_RULE;

		return implode( "\n", $lines );
	}

	/**
	 * Read a field that may live on the chunk itself or inside its metadata.
	 *
	 * The first location holding a non-null value wins, mirroring a `??` chain.
	 *
	 * @param  array  $chunk  Chunk data.
	 * @param  string  $key  Field name.
	 * @param  bool  $prefer_metadata  Check $chunk['metadata'][$key] before $chunk[$key].
	 *
	 * @return string Field value as text, '' when absent or not representable as text.
	 */
	private function chunk_text( array $chunk, string $key, bool $prefer_metadata = false ): string {
		$top_level = $chunk[ $key ] ?? null;
		$from_meta = $chunk['metadata'][ $key ] ?? null;

		return $this->to_text( $prefer_metadata ? ( $from_meta ?? $top_level ) : ( $top_level ?? $from_meta ) );
	}

	/**
	 * Convert a loosely typed chunk value to a string without raising errors.
	 *
	 * @param  mixed  $value  Scalar, stringable object, or anything else.
	 *
	 * @return string String form of scalars and stringable objects; '' for null, arrays and other values.
	 */
	private function to_text( $value ): string {
		if ( is_scalar( $value ) ) {
			return (string) $value;
		}
		if ( is_object( $value ) && method_exists( $value, '__toString' ) ) {
			return (string) $value;
		}

		return '';
	}

	/**
	 * Render one raw metadata value for the admin display.
	 *
	 * Booleans and null are spelled out so they stay distinguishable from an empty
	 * string; arrays and plain objects are JSON-encoded.
	 *
	 * @param  mixed  $value  Metadata value of any type.
	 *
	 * @return string Printable representation.
	 */
	private function format_display_value( $value ): string {
		if ( is_bool( $value ) ) {
			return $value ? 'true' : 'false';
		}
		if ( null === $value ) {
			return 'null';
		}
		if ( is_scalar( $value ) ) {
			return (string) $value;
		}
		if ( is_object( $value ) && method_exists( $value, '__toString' ) ) {
			return (string) $value;
		}

		// Partial output keeps the dump useful even when a nested value cannot be encoded.
		$encoded = json_encode( $value, JSON_UNESCAPED_SLASHES | JSON_PARTIAL_OUTPUT_ON_ERROR );

		return false === $encoded ? '[unencodable ' . gettype( $value ) . ']' : $encoded;
	}
}