<?php
// includes/extractor.php
// Stop immediately unless the ABSPATH constant is defined, so the file does
// nothing when it is executed outside of the application that defines it.
if ( ! defined('ABSPATH') ) { exit; }

/**
 * Normalize an HTML document to UTF-8.
 *
 * The source charset is taken from the first of these that yields a label
 * mbstring actually supports:
 *   1. the `charset` parameter of the `content-type` header (key matched
 *      case-insensitively, value may be quoted),
 *   2. a `<meta ... charset=...>` declaration inside the markup,
 *   3. a heuristic: valid UTF-8 is kept as is, otherwise strict
 *      mb_detect_encoding() over a fixed candidate list.
 *
 * Unknown labels are skipped instead of being handed to mbstring, because on
 * PHP 8 an invalid encoding name raises ValueError, which `@` cannot silence.
 *
 * @param string $html    Raw markup in an unknown encoding.
 * @param array  $headers Response headers keyed by name; only `content-type` is read.
 * @return string The markup converted to UTF-8, or the input unchanged when no
 *                conversion is needed or possible.
 */
function urlifywriternormalize_encoding(string $html, array $headers = []) : string {
	// True when mbstring recognises the label (PHP 7 warns + returns false, PHP 8 throws).
	$is_supported = static function (string $label) : bool {
		if ($label === '') {
			return false;
		}
		try {
			return @mb_check_encoding('', $label) === true;
		} catch (\ValueError $e) {
			return false;
		}
	};

	$charset = '';

	// 1) HTTP header
	$lowered = array_change_key_case($headers, CASE_LOWER);
	$ctype   = $lowered['content-type'] ?? '';
	if (is_string($ctype) && preg_match('/charset\s*=\s*["\']?([\w\-]+)/i', $ctype, $m) && $is_supported($m[1])) {
		$charset = $m[1];
	}

	// 2) <meta charset=...> or <meta http-equiv ... content="...; charset=...">
	if ($charset === '' && preg_match('/<meta[^>]+charset=([\'"]?)([\w\-\d]+)\1/i', $html, $m) && $is_supported($m[2])) {
		$charset = $m[2];
	}

	// 3) Heuristic
	if ($charset === '') {
		if (mb_check_encoding($html, 'UTF-8')) {
			// Valid UTF-8 wins outright, independent of the detector's scoring.
			$charset = 'UTF-8';
		} else {
			$candidates = array_values(array_filter(
				['ISO-8859-1', 'ISO-8859-15', 'CP1251', 'CP1256', 'CP932', 'SJIS', 'EUC-JP', 'BIG-5', 'GB18030', 'GB2312'],
				$is_supported
			));
			$detected = $candidates !== [] ? @mb_detect_encoding($html, $candidates, true) : false;
			if (is_string($detected) && $detected !== '') {
				$charset = $detected;
			}
		}
	}

	// Convert to UTF-8 when the resolved charset is something else.
	if ($charset !== '' && strtoupper($charset) !== 'UTF-8') {
		try {
			$converted = @mb_convert_encoding($html, 'UTF-8', $charset);
		} catch (\ValueError $e) {
			$converted = false;
		}
		if (is_string($converted)) {
			$html = $converted;
		}
	}
	return $html;
}

/**
 * Download the HTML behind a URL with wp_remote_get().
 *
 * A first request is sent with the plugin's own User-Agent. When the response
 * status is 403, 406, 429, 451 or 503, or the body is empty, a single retry is
 * made after a 200-450 ms pause using browser-like headers.
 *
 * @param string $url Absolute http(s) URL.
 * @return string|WP_Error Body passed through urlifywriter_normalize_utf8_body(),
 *                         or a WP_Error (bad URL, transport error, HTTP status
 *                         outside 200-399, empty body).
 */
function urlifywriterfetch_html( string $url ) {
	$url     = esc_url_raw($url);
	$is_http = $url && preg_match('~^https?://~i', $url);
	if ( ! $is_http ) {
		return new WP_Error('urlifywriterbad_url', 'Invalid URL');
	}

	// scheme://host/ of the target, sent as Referer.
	$origin = preg_replace('~^(https?://[^/]+).*$~i', '$1/', $url) ?: $url;

	$base_args = [
		'timeout'            => 12,
		'redirection'        => 5,
		'reject_unsafe_urls' => true,
		'sslverify'          => true,
		'blocking'           => true,
	];

	// Header profile 1: plugin identity. Reuses the autoscan header builder when it is loaded.
	$plugin_profile = function() use ($url, $origin) {
		if ( ! function_exists('urlifywriterautoscan_build_headers') ) {
			return [
				'User-Agent'      => 'UrlifyWriter/1.0 (+https://urlifywriter.com)',
				'Accept'          => 'text/html,application/xhtml+xml',
				'Accept-Language' => 'en-US,en;q=0.9,es-ES;q=0.8',
				'Referer'         => $origin,
				'Cache-Control'   => 'no-cache',
				'Pragma'          => 'no-cache',
				'Connection'      => 'keep-alive',
			];
		}
		$built               = urlifywriterautoscan_build_headers($url);
		$built['User-Agent'] = 'UrlifyWriter/1.0 (+https://urlifywriter.com)';
		return $built;
	};

	// Header profile 2: browser look-alike, built lazily only if the retry happens.
	$browser_profile = function() use ($origin) {
		if ( function_exists('urlifywriterautoscan_random_ua') ) {
			$agent = urlifywriterautoscan_random_ua();
		} else {
			$agent = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0 Safari/537.36';
		}
		if ( function_exists('urlifywriterautoscan_accept_lang_from_wp') ) {
			$languages = urlifywriterautoscan_accept_lang_from_wp();
		} else {
			$languages = 'en-US,en;q=0.9,es-ES;q=0.8';
		}

		return [
			'User-Agent'      => $agent,
			'Accept'          => 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
			'Accept-Language' => $languages,
			'Referer'         => $origin,
			'Cache-Control'   => 'no-cache',
			'Pragma'          => 'no-cache',
			'Connection'      => 'keep-alive',
		];
	};

	// One GET with the shared arguments plus the given header set.
	$request = function( $headers ) use ($url, $base_args) {
		return wp_remote_get($url, $base_args + [ 'headers' => $headers ]);
	};

	$response = $request( $plugin_profile() );
	if ( is_wp_error($response) ) {
		return $response;
	}
	$status  = (int) wp_remote_retrieve_response_code($response);
	$payload = (string) wp_remote_retrieve_body($response);

	$looks_blocked = in_array($status, [403,406,429,451,503], true);
	if ( $looks_blocked || $payload === '' ) {
		// Small random pause before retrying: 200-450 ms.
		usleep(200000 + random_int(0, 250000));

		$response = $request( $browser_profile() );
		if ( is_wp_error($response) ) {
			return $response;
		}
		$status  = (int) wp_remote_retrieve_response_code($response);
		$payload = (string) wp_remote_retrieve_body($response);
	}

	if ( $status < 200 || $status >= 400 ) {
		return new WP_Error('urlifywriterhttp', 'HTTP status ' . $status);
	}
	if ( $payload === '' ) {
		return new WP_Error('urlifywriterempty', 'Empty body');
	}

	// Re-encode the body as UTF-8 based on the response Content-Type.
	return urlifywriter_normalize_utf8_body(
		$payload,
		wp_remote_retrieve_header($response, 'content-type')
	);
}

/**
 * Re-encode a response body as UTF-8.
 *
 * Resolution order:
 *   1. the `charset` parameter of the Content-Type header (quotes stripped),
 *      when it names an encoding other than UTF-8 that mbstring can convert;
 *   2. the body itself when it is already valid UTF-8;
 *   3. strict mb_detect_encoding() over UTF-8, ISO-8859-1, Windows-1252, ASCII;
 *   4. as a last resort a UTF-8 to UTF-8 pass, which replaces invalid bytes.
 *
 * A charset label mbstring does not know makes mb_convert_encoding() throw
 * ValueError on PHP 8 (not suppressible with `@`); that case is caught here so
 * the function falls through to the heuristics as intended.
 *
 * Defined only if no function with this name exists yet.
 *
 * @param string $body                Raw response body.
 * @param mixed  $content_type_header Content-Type header value; ignored unless it is a string.
 * @return string UTF-8 text.
 */
if ( ! function_exists('urlifywriter_normalize_utf8_body') ) {
	function urlifywriter_normalize_utf8_body( string $body, $content_type_header = '' ) {
		// Convert from an explicit source encoding; null when mbstring refuses it.
		$to_utf8 = static function ( string $text, string $from ) {
			try {
				$result = @mb_convert_encoding($text, 'UTF-8', $from);
			} catch ( \ValueError $e ) {
				return null;
			}
			return is_string($result) ? $result : null;
		};

		$charset = '';
		if ( is_string($content_type_header) && preg_match('~charset\s*=\s*["\']?([^\s;"\']+)~i', $content_type_header, $m) ) {
			$charset = strtoupper($m[1]);
		}
		if ( $charset !== '' && $charset !== 'UTF-8' ) {
			$converted = $to_utf8($body, $charset);
			if ( $converted !== null ) return $converted;
		}

		// Already valid UTF-8: nothing to convert.
		if ( mb_check_encoding($body, 'UTF-8') ) {
			return $body;
		}

		// Heuristic when there is no usable charset or the conversion failed.
		$enc = @mb_detect_encoding($body, 'UTF-8, ISO-8859-1, Windows-1252, ASCII', true);
		if ( $enc && strtoupper($enc) !== 'UTF-8' ) {
			$converted = $to_utf8($body, $enc);
			if ( $converted !== null ) return $converted;
		}

		// Last resort: force a valid UTF-8 string.
		return (string) @mb_convert_encoding($body, 'UTF-8', 'UTF-8');
	}
}


/**
 * Parse an HTML string into a DOMDocument, keeping non-ASCII text intact.
 *
 * Every character above U+007F is turned into a numeric character reference
 * before parsing and the markup is prefixed with an XML encoding declaration,
 * so the parser never has to guess the charset.
 *
 * The libxml "internal errors" switch is turned on only for the duration of
 * the parse and restored afterwards, so other code in the same request keeps
 * whatever error-handling mode it had selected.
 *
 * @param string $html Markup, expected to be UTF-8.
 * @return DOMDocument|null The parsed document, or null when loadHTML() fails.
 */
function urlifywriterhtml_to_dom( string $html ) : ?DOMDocument {
	$previous_mode = libxml_use_internal_errors(true);
	try {
		$dom = new DOMDocument('1.0', 'UTF-8');

		// Not valid UTF-8: re-encode from mbstring's internal encoding.
		if ( ! mb_check_encoding($html, 'UTF-8') ) {
			$html = (string) @mb_convert_encoding($html, 'UTF-8');
		}

		// Numeric entities for everything outside ASCII. Replaces the
		// 'HTML-ENTITIES' pseudo-encoding, which is deprecated since PHP 8.2.
		$html = mb_encode_numericentity($html, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');

		// Prefix an encoding declaration so DOMDocument treats the input as UTF-8.
		$loaded = $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_NOWARNING | LIBXML_NOERROR);
		libxml_clear_errors();
		return $loaded ? $dom : null;
	} finally {
		libxml_use_internal_errors($previous_mode);
	}
}

/**
 * Pick the node that most likely wraps the article content.
 *
 * A fixed list of XPath expressions is tried in order and the first node of
 * the first expression that matches anything is returned; `//body` is the
 * final catch-all.
 *
 * @param DOMXPath    $xp  XPath evaluator bound to the document.
 * @param DOMDocument $dom The document (not used by the lookup itself).
 * @return DOMNode|null The chosen node, or null when nothing matches.
 */
function urlifywriterfind_main_node( DOMXPath $xp, DOMDocument $dom ) : ?DOMNode {
	$expressions = [
		'//article',
		'//main',
		'//*[@id="content"]',
		'//*[@id="main"]',
		'//*[contains(@class,"entry-content")]',
		'//*[contains(@class,"post-content")]',
		'//*[contains(@class,"content__body")]',
		'//*[contains(@class,"article-body")]',
		'//body',
	];

	foreach ( $expressions as $expression ) {
		$hits = $xp->query( $expression );
		if ( ! $hits || $hits->length === 0 ) {
			continue;
		}
		return $hits->item(0);
	}

	return null;
}

/**
 * Strip noise elements below $root and collect the text of its paragraphs and headings.
 *
 * Step 1 removes every script, style, nav, aside, footer, form, noscript,
 * iframe and figure element found under $root (this mutates the DOM).
 * Step 2 walks the remaining descendants in document order (via
 * UrlifyWriter_RecursiveDOMIterator) and, for each p / h1-h5 element, takes
 * its text content, decodes HTML entities, turns NBSP into a plain space,
 * collapses whitespace and keeps it when at least 30 characters remain.
 *
 * @param DOMNode $root Subtree to scan; $root itself is never emitted.
 * @return array<int,array{type:string,tag:string,text:string}> 'type' is 'p' for paragraphs and 'h' for headings.
 */
function urlifywriterextract_text_blocks( DOMNode $root ) : array {
	// Step 1: prune non-content subtrees, using an explicit stack instead of recursion.
	$noise_tags = ['script','style','nav','aside','footer','form','noscript','iframe','figure'];
	$pending    = [ $root ];
	while ( $pending !== [] ) {
		$parent = array_pop( $pending );
		if ( ! $parent->hasChildNodes() ) {
			continue;
		}
		// Snapshot first: removing nodes while iterating the live list would skip siblings.
		$snapshot = [];
		foreach ( $parent->childNodes as $kid ) {
			$snapshot[] = $kid;
		}
		foreach ( $snapshot as $kid ) {
			$is_noise = $kid->nodeType === XML_ELEMENT_NODE
				&& in_array( strtolower( $kid->nodeName ), $noise_tags, true );
			if ( $is_noise ) {
				$parent->removeChild( $kid );
				continue;
			}
			$pending[] = $kid;
		}
	}

	// Step 2: harvest text from the allowed elements.
	$text_tags = ['p','h1','h2','h3','h4','h5'];
	$collected = [];

	$descendants = new RecursiveIteratorIterator(
		new UrlifyWriter_RecursiveDOMIterator($root),
		RecursiveIteratorIterator::SELF_FIRST
	);

	foreach ( $descendants as $element ) {
		if ( $element->nodeType !== XML_ELEMENT_NODE ) {
			continue;
		}
		$name = strtolower( $element->nodeName );
		if ( ! in_array( $name, $text_tags, true ) ) {
			continue;
		}

		$content = $element->textContent ?? '';
		$content = html_entity_decode($content, ENT_QUOTES | ENT_HTML5, 'UTF-8');
		$content = preg_replace('/\x{00A0}/u', ' ', $content); // NBSP -> space
		$content = trim( preg_replace( '/\s+/u', ' ', $content ) );

		// Anything shorter than 30 characters (including empty) is treated as noise.
		if ( mb_strlen( $content ) < 30 ) {
			continue;
		}

		$collected[] = [
			'type' => $name === 'p' ? 'p' : 'h',
			'tag'  => $name,
			'text' => $content,
		];
	}

	return $collected;
}

/**
 * Tell whether the text contains at least one character from the kana,
 * Bopomofo, Hangul, CJK ideograph, Thai, Lao or Khmer ranges listed in the
 * pattern; such text is truncated by characters instead of words.
 *
 * @param string $text UTF-8 text.
 * @return bool True on a match; false otherwise (including when the regex fails).
 */
function urlifywriteris_cjk_like( string $text ) : bool {
	$pattern = '/[\x{3040}-\x{30FF}\x{3100}-\x{312F}\x{3130}-\x{318F}\x{3400}-\x{9FFF}\x{AC00}-\x{D7AF}\x{0E00}-\x{0E7F}\x{0E80}-\x{0EFF}\x{1780}-\x{17FF}]/u';

	return preg_match( $pattern, $text ) === 1;
}

/**
 * Count whitespace-separated words in a UTF-8 string.
 *
 * @param string $text Text in a language that separates words with whitespace.
 * @return int Number of non-empty tokens; 0 when the split fails.
 */
function urlifywritercount_words_utf8( string $text ) : int {
	$tokens = preg_split( '/\s+/u', trim( $text ), -1, PREG_SPLIT_NO_EMPTY );

	return $tokens === false ? 0 : count( $tokens );
}

/**
 * Keep leading blocks until a size budget is used up.
 *
 * The first ~400 characters of block text decide the unit: when they contain
 * CJK-like characters (see urlifywriteris_cjk_like()) blocks are measured in
 * characters and the budget is floor($max_words_total * 1.8); otherwise blocks
 * are measured in words and the budget is $max_words_total.
 *
 * Blocks are kept whole, in order, and the scan stops at the first block that
 * would exceed the budget. The one exception: if the very first block alone is
 * too large, it is cut down to the budget rather than returning nothing, so
 * non-empty input with a positive budget yields at least one block.
 *
 * @param array $blocks          List of blocks; each is expected to carry a 'text' string.
 * @param int   $max_words_total Budget in words.
 * @return array The retained blocks.
 */
function urlifywritertruncate_blocks_by_limit( array $blocks, int $max_words_total ) : array {
	// Sample the beginning of the text to choose the measuring unit.
	$sampler = '';
	foreach ( $blocks as $b ) {
		$sampler .= ' ' . ( $b['text'] ?? '' );
		if ( mb_strlen( $sampler ) > 400 ) break;
	}
	$is_cjk = urlifywriteris_cjk_like( $sampler );

	$out   = [];
	$count = 0;
	$limit = $is_cjk ? (int) floor( $max_words_total * 1.8 ) : $max_words_total;

	foreach ( $blocks as $b ) {
		// A block without 'text' counts as empty instead of raising a warning/TypeError.
		$text = (string) ( $b['text'] ?? '' );
		$len  = $is_cjk ? mb_strlen( $text ) : urlifywritercount_words_utf8( $text );

		if ( $count + $len > $limit ) {
			// First block alone overflows: keep a shortened copy so the result is not empty.
			if ( $out === [] && $limit > 0 ) {
				if ( $is_cjk ) {
					$cut = mb_substr( $text, 0, $limit );
				} else {
					$words = preg_split( '/\s+/u', trim( $text ), -1, PREG_SPLIT_NO_EMPTY );
					$cut   = is_array( $words ) ? implode( ' ', array_slice( $words, 0, $limit ) ) : '';
				}
				if ( $cut !== '' ) {
					$b['text'] = $cut;
					$out[]     = $b;
				}
			}
			break;
		}

		$out[]  = $b;
		$count += $len;
	}
	return $out;
}

/**
 * Tell whether the text should be rendered right-to-left: true as soon as it
 * contains one character in U+0590..U+08FF (the range that holds the Hebrew
 * and Arabic blocks, among others).
 *
 * @param string $text UTF-8 text.
 * @return bool True on a match; false otherwise (including when the regex fails).
 */
function urlifywriteris_rtl_text( string $text ) : bool {
	$found = preg_match('/[\x{0590}-\x{08FF}]/u', $text);

	return $found === 1;
}

/**
 * Main entry point: fetch a URL and return its title, text blocks, excerpt and RTL flag.
 *
 * Pipeline: urlifywriterfetch_html() -> urlifywriterhtml_to_dom() ->
 * urlifywriterfind_main_node() -> urlifywriterextract_text_blocks() ->
 * urlifywritertruncate_blocks_by_limit(). The word budget comes from the
 * `urlifywritermax_words` option (default 1200) clamped to 200..5000. The
 * excerpt is the text of the first retained paragraph block, and the RTL flag
 * is computed from title + excerpt.
 *
 * @param string $url Page to extract.
 * @return array{ title:string, blocks:array<int,array{type:string,tag:string,text:string}>, excerpt:string, rtl:bool }|WP_Error
 */
function urlifywriterextract_from_url_text( string $url ) {
	$markup = urlifywriterfetch_html( $url );
	if ( is_wp_error( $markup ) ) {
		return $markup;
	}

	$document = urlifywriterhtml_to_dom( $markup );
	if ( ! $document ) {
		return new WP_Error( 'urlifywriterdom', 'Cannot parse HTML' );
	}

	$xpath = new DOMXPath( $document );

	// Title: first <title> element, entities decoded, whitespace collapsed.
	$title       = '';
	$title_nodes = $xpath->query( '//title' );
	if ( $title_nodes && $title_nodes->length ) {
		$raw_title = html_entity_decode( $title_nodes->item(0)->textContent, ENT_QUOTES | ENT_HTML5, 'UTF-8' );
		$title     = trim( preg_replace( '/\s+/u', ' ', $raw_title ) );
	}

	$container = urlifywriterfind_main_node( $xpath, $document );
	if ( ! $container ) {
		return new WP_Error( 'urlifywritermain', 'Main content not found' );
	}

	$blocks = urlifywriterextract_text_blocks( $container );
	if ( empty( $blocks ) ) {
		return new WP_Error( 'urlifywriterempty_content', 'No article text found' );
	}

	// Configured word budget, clamped to 200..5000.
	$word_budget = max( 200, min( 5000, intval( get_option( 'urlifywritermax_words', 1200 ) ) ) );
	$blocks      = urlifywritertruncate_blocks_by_limit( $blocks, $word_budget );

	// Excerpt: first paragraph that survived truncation, if any.
	$excerpt = '';
	foreach ( $blocks as $block ) {
		if ( $block['type'] !== 'p' ) {
			continue;
		}
		$excerpt = $block['text'];
		break;
	}

	return [
		'title'   => $title,
		'blocks'  => $blocks,
		'excerpt' => $excerpt,
		'rtl'     => urlifywriteris_rtl_text( $title . ' ' . $excerpt ),
	];
}

/**
 * Recursive iterator over the child nodes of a DOMNode.
 *
 * The children are copied into a plain array at construction time, so later
 * changes to the DOM do not affect an iteration already in progress. Wrap it
 * in a RecursiveIteratorIterator to visit the whole subtree.
 */
class UrlifyWriter_RecursiveDOMIterator implements RecursiveIterator {
	/** @var DOMNode[] Snapshot of the direct children. */
	private $children = [];
	/** @var int Index of the current child. */
	private $cursor = 0;

	/**
	 * @param DOMNode $domNode Node whose direct children will be iterated.
	 */
	public function __construct( DOMNode $domNode ) {
		if ( ! $domNode->hasChildNodes() ) {
			return;
		}
		foreach ( $domNode->childNodes as $child ) {
			$this->children[] = $child;
		}
	}

	/** Child at the cursor. */
	public function current(): mixed {
		return $this->children[ $this->cursor ];
	}

	/** Zero-based position of the cursor. */
	public function key(): mixed {
		return $this->cursor;
	}

	public function next(): void {
		++$this->cursor;
	}

	public function rewind(): void {
		$this->cursor = 0;
	}

	public function valid(): bool {
		return $this->cursor < count( $this->children );
	}

	/** Whether the current child has children of its own. */
	public function hasChildren(): bool {
		return $this->current()->hasChildNodes();
	}

	/** Iterator over the current child's own children. */
	public function getChildren(): UrlifyWriter_RecursiveDOMIterator {
		return new UrlifyWriter_RecursiveDOMIterator( $this->current() );
	}
}