<?php
/**
 * UrlifyWriter — AutoScan Crawler
 *
 * Objetivo:
 * - Detección inteligente de enlaces "artículo" desde portada (HTML) o, si existe, desde RSS/Atom.
 * - Filtrado de paginación/archivo/anuncios.
 * - Priorización simple por posición/fecha y señales (etiquetas <article>, encabezados, JSON-LD Article).
 *
 * API principal que usa autoscan-manager.php:
 *   urlifywriterautoscan_extract_links(string $html, string $base_url): array< [url, title, priority] >
 *
 * NOTAS:
 * - Este archivo NO inserta en BD ni llama a generadores. Solo devuelve candidatos.
 * - Si detecta feeds <link rel="alternate" type="application/rss+xml|atom+xml"> intenta usarlos.
 */

// Stop immediately when the ABSPATH constant is not defined, i.e. when this file
// is requested directly instead of being loaded by WordPress.
if ( ! defined('ABSPATH') ) { exit; }

/* ============================================================
 * Helpers de URL (copias ligeras para no romper si no se cargan otros archivos)
 * ============================================================ */

if ( ! function_exists('urlifywriterautoscan_norm_url') ) {
	/**
	 * Normalise a URL so that equivalent links compare equal.
	 *
	 * - Trims the input; protocol-relative input ("//host/x") gets the scheme of the
	 *   current request (is_ssl()), scheme-less input ("host/x") gets "https://".
	 * - Lower-cases scheme and host, keeps the port, defaults the path to "/".
	 * - Drops common tracking parameters (utm_*, fbclid, gclid, yclid, mc_cid, mc_eid).
	 * - The URL is rebuilt from scheme/host/port/path/query only, so any fragment or
	 *   user-info part is discarded.
	 *
	 * NOTE: the query string is round-tripped through parse_str()/http_build_query(),
	 * so its encoding may be rewritten (PHP's parse_str() also renames keys that
	 * contain dots or spaces).
	 *
	 * @param mixed $url Candidate URL (cast to string).
	 * @return string Normalised http(s) URL, or '' when the input is empty, has no
	 *                host, or uses a non-web scheme (mailto:, javascript:, ftp://, ...).
	 */
	function urlifywriterautoscan_norm_url( $url ) {
		$url = trim( (string) $url );
		if ( $url === '' ) return '';

		// Complete the scheme when it is missing.
		if ( strpos($url, '//') === 0 ) {
			$url = ( is_ssl() ? 'https:' : 'http:' ) . $url;
		}
		if ( ! preg_match('~^https?://~i', $url) ) {
			// Anything that already carries another scheme is not a web page. Without this
			// check "mailto:info@example.com" was turned into "https://example.com/".
			if ( preg_match('~^(?:[a-z][a-z0-9+.\-]*://|(?:mailto|javascript|tel|sms|data|about|blob):)~i', $url) ) {
				return '';
			}
			$url = 'https://' . ltrim($url, '/');
		}

		$p = wp_parse_url($url);
		if ( ! is_array($p) || empty($p['host']) ) return '';

		// Strip common tracking parameters.
		$q = [];
		if ( ! empty($p['query']) ) {
			parse_str($p['query'], $q);
			foreach ( array_keys($q) as $k ) {
				// Keys may be integers (e.g. "?123=x"), hence the cast.
				if ( preg_match('/^(utm_|fbclid|gclid|yclid|mc_cid|mc_eid)/i', (string) $k) ) unset($q[$k]);
			}
		}

		$scheme = isset($p['scheme']) ? strtolower($p['scheme']) : 'https';
		$host   = strtolower($p['host']);
		$port   = isset($p['port']) ? ':' . $p['port'] : '';
		$path   = isset($p['path']) ? $p['path'] : '/';
		$qs     = $q ? ( '?' . http_build_query($q) ) : '';

		return $scheme . '://' . $host . $port . $path . $qs;
	}
}

if ( ! function_exists('urlifywriterautoscan_abs_url') ) {
	/**
	 * Resolve a possibly-relative link against a base URL.
	 *
	 * Absolute http(s) links are returned as they are. When the plugin-wide helper
	 * urlifywritermake_absolute_url() is loaded it does the resolution; otherwise a
	 * built-in resolver is used, which:
	 *  - returns '' for links with another scheme (mailto:, javascript:, tel:, ...),
	 *    since they can never be crawled as a page;
	 *  - gives protocol-relative links ("//host/x") the scheme of the base URL;
	 *  - resolves "?query" / "#fragment"-only links against the base document itself;
	 *  - removes "." and ".." path segments without touching the query/fragment.
	 *
	 * @param mixed  $maybe Link as found in the document (cast to string, trimmed).
	 * @param string $base  Absolute URL of the document that contained the link.
	 * @return string Absolute URL; '' for non-HTTP schemes; the input itself when
	 *                $base has no host and nothing can be resolved.
	 */
	function urlifywriterautoscan_abs_url( $maybe, $base ) {
		$maybe = trim( (string) $maybe );
		if ( preg_match('~^https?://~i', $maybe) ) return $maybe;

		// Prefer the plugin-wide helper when it is loaded.
		if ( function_exists('urlifywritermake_absolute_url') ) {
			return urlifywritermake_absolute_url($maybe, $base);
		}

		// --- Basic fallback resolver ---

		// A link that already has a (non-http) scheme must not be glued onto the base path.
		if ( preg_match('~^[a-z][a-z0-9+.\-]*:~i', $maybe) ) return '';

		$bp = wp_parse_url( (string) $base );
		if ( ! is_array($bp) || empty($bp['host']) ) return $maybe;

		$scheme    = $bp['scheme'] ?? 'https';
		$authority = $bp['host'] . ( isset($bp['port']) ? ':' . $bp['port'] : '' );
		$bpath     = ( isset($bp['path']) && $bp['path'] !== '' ) ? $bp['path'] : '/';

		// Protocol-relative link: only the scheme is inherited from the base.
		if ( strpos($maybe, '//') === 0 ) return $scheme . ':' . $maybe;

		// Split off "?query" / "#fragment" so that dot-segment removal only sees the path.
		$suffix = '';
		$cut    = strcspn($maybe, '?#');
		if ( $cut < strlen($maybe) ) {
			$suffix = substr($maybe, $cut);
			$maybe  = substr($maybe, 0, $cut);
		}

		if ( $maybe === '' ) {
			// Empty, query-only or fragment-only reference: same document as the base.
			$path = $bpath;
			if ( ( $suffix === '' || $suffix[0] === '#' ) && isset($bp['query']) && $bp['query'] !== '' ) {
				$suffix = '?' . $bp['query'] . $suffix; // a fragment-only link keeps the base query
			}
		} elseif ( $maybe[0] === '/' ) {
			$path = $maybe;
		} else {
			// Relative path: replace the last segment of the base path.
			$path = preg_replace('~/[^/]*$~', '/', $bpath) . $maybe;
		}

		// Remove "." and ".." segments. $stack[0] is the empty segment in front of the
		// leading slash and is never popped, so ".." cannot climb above the root.
		$segments = explode('/', $path);
		$last     = end($segments);
		$stack    = [];
		foreach ( $segments as $seg ) {
			if ( $seg === '.' ) continue;
			if ( $seg === '..' ) {
				if ( count($stack) > 1 ) array_pop($stack);
				continue;
			}
			$stack[] = $seg;
		}
		// A path ending in "." or ".." names a directory: keep the trailing slash.
		if ( $last === '.' || $last === '..' ) $stack[] = '';

		$canon = implode('/', $stack);
		if ( $canon === '' ) $canon = '/';

		return $scheme . '://' . $authority . $canon . $suffix;
	}
}

/* ============================================================
 * Filtros de candidatos (mismos criterios que en manager)
 * ============================================================ */
if ( ! function_exists('urlifywriterautoscan_is_candidate_path') ) {
	/**
	 * Decide whether a URL looks like an article page of the scanned site.
	 *
	 * A URL is rejected when its host differs from $base_host (case-insensitive),
	 * when it points at the site root, when its path ends in a media/download
	 * extension, or when the path contains one of the archive / pagination / ads /
	 * feed markers listed below (case-insensitive substring match).
	 *
	 * @param string $url       Absolute URL to test.
	 * @param string $base_host Host of the site being scanned.
	 * @return bool True when the URL passes every filter.
	 */
	function urlifywriterautoscan_is_candidate_path( $url, $base_host ) {
		$parsed    = wp_parse_url( $url );
		$link_host = empty($parsed['host']) ? '' : strtolower($parsed['host']);

		// Same-host links only.
		if ( $link_host === '' || $link_host !== strtolower($base_host) ) {
			return false;
		}

		$path = $parsed['path'] ?? '/';

		// Home page / root.
		if ( rtrim($path, '/') === '' ) {
			return false;
		}

		// Media and download extensions.
		if ( preg_match('~\.(jpg|jpeg|png|webp|gif|pdf|zip|mp3|mp4|avi|mov)$~i', $path) ) {
			return false;
		}

		// Typical archive / pagination / ads / feed routes.
		$blocked = [
			'/page/', '/tag/', '/tags/', '/category/', '/categories/',
			'/author/', '/search', '/buscar', '/ads', '/advert', '/sponsor',
			'/feed', '/rss', '/atom',
		];
		$haystack = strtolower($path);
		foreach ( $blocked as $marker ) {
			if ( strpos($haystack, $marker) !== false ) {
				return false;
			}
		}

		return true;
	}
}

/* ============================================================
 * 1) Detección de feeds en HTML
 * ============================================================ */
if ( ! function_exists('urlifywriterautoscan_find_feeds') ) {
	/**
	 * Collect the RSS/Atom feed URLs advertised by a page.
	 *
	 * Scans every <link> tag whose rel attribute contains the token "alternate" and
	 * whose type is application/rss+xml or application/atom+xml, in any attribute
	 * order. When the page advertises none, three conventional locations on the
	 * site root (/feed/, /rss/, /atom.xml) are returned as guesses instead.
	 *
	 * @param string $html     Page markup; an empty or non-string value yields [].
	 * @param string $base_url URL of the page, used to resolve relative hrefs and
	 *                         to build the guessed locations.
	 * @return string[] Normalised, de-duplicated feed URLs.
	 */
	function urlifywriterautoscan_find_feeds( $html, $base_url ) {
		$feeds = [];
		if ( ! is_string($html) || $html === '' ) return $feeds;

		$feed_types = [ 'application/rss+xml', 'application/atom+xml' ];

		// Inspect each <link ...> tag on its own so attribute order does not matter
		// (the tag may end right after rel="alternate").
		if ( preg_match_all('~<link\b[^>]*>~i', $html, $tags) ) {
			foreach ( $tags[0] as $tag ) {
				// The look-behind keeps "data-rel", "data-href" etc. from matching.
				if ( ! preg_match('~(?<![\w-])rel\s*=\s*(["\'])(.*?)\1~is', $tag, $m) ) continue;
				$rels = preg_split('/\s+/', strtolower(trim($m[2])), -1, PREG_SPLIT_NO_EMPTY);
				if ( ! in_array('alternate', $rels, true) ) continue;

				$type = '';
				if ( preg_match('~(?<![\w-])type\s*=\s*(["\'])(.*?)\1~is', $tag, $m) ) {
					// Ignore media-type parameters such as "; charset=utf-8".
					$type = strtolower( trim( explode(';', $m[2])[0] ) );
				}

				$href = '';
				if ( preg_match('~(?<![\w-])href\s*=\s*(["\'])(.*?)\1~is', $tag, $m) ) {
					$href = trim( html_entity_decode($m[2], ENT_QUOTES, 'UTF-8') );
				}

				if ( $href !== '' && in_array($type, $feed_types, true) ) {
					$feeds[] = urlifywriterautoscan_abs_url($href, $base_url);
				}
			}
		}

		// Nothing advertised: try the usual locations.
		if ( empty($feeds) ) {
			$bp = wp_parse_url($base_url);
			if ( ! empty($bp['scheme']) && ! empty($bp['host']) ) {
				$root  = $bp['scheme'] . '://' . $bp['host'] . ( isset($bp['port']) ? ':' . $bp['port'] : '' );
				$feeds = [
					$root . '/feed/',
					$root . '/rss/',
					$root . '/atom.xml',
				];
			}
		}

		// Normalise, drop empties and de-duplicate.
		$feeds = array_map('urlifywriterautoscan_norm_url', $feeds);
		return array_values( array_unique( array_filter($feeds) ) );
	}
}

/* ============================================================
 * 2) Descarga y parseo de feed RSS/Atom
 * ============================================================ */
if ( ! function_exists('urlifywriterautoscan_fetch') ) {
	/**
	 * Download a URL with wp_remote_get() and return its body.
	 *
	 * Request headers come from urlifywriterautoscan_build_headers() when that
	 * helper is loaded. It is not defined in this file, so a plain Accept header
	 * is used when it is missing instead of triggering a fatal error.
	 *
	 * @param string $url     URL to download.
	 * @param int    $timeout Timeout in seconds; values below 5 are raised to 5.
	 * @return string|WP_Error Response body ('' when it is not a string), the
	 *                         transport error, or WP_Error('http_error') for a
	 *                         status code outside 200-399.
	 */
	function urlifywriterautoscan_fetch( $url, $timeout = 18 ) {
		if ( function_exists('urlifywriterautoscan_build_headers') ) {
			$headers = urlifywriterautoscan_build_headers( $url );
		} else {
			$headers = [
				'Accept' => 'application/rss+xml, application/atom+xml, application/xml;q=0.9, text/html;q=0.8, */*;q=0.5',
			];
		}

		$resp = wp_remote_get( $url, [
			'timeout'            => max(5, (int) $timeout),
			'redirection'        => 5,
			'headers'            => $headers,
			'reject_unsafe_urls' => true,
			'sslverify'          => true,
		] );
		if ( is_wp_error($resp) ) return $resp;

		$code = wp_remote_retrieve_response_code($resp);
		if ( $code < 200 || $code >= 400 ) {
			return new WP_Error('http_error', 'HTTP ' . $code);
		}

		$body = wp_remote_retrieve_body($resp);
		return is_string($body) ? $body : '';
	}
}

if ( ! function_exists('urlifywriterautoscan_parse_feed') ) {
	/**
	 * Parse an RSS 2.0 or Atom 1.0 document into a flat list of entries.
	 *
	 * RSS: uses <link>, falling back to a <guid> that is a permalink (isPermaLink is
	 * not "false") and looks like an http(s) URL.
	 * Atom: prefers the <link> whose rel is "alternate" or absent, because entries
	 * frequently list rel="replies"/"self"/"edit" links first; the first link with
	 * an href is only used when no alternate link exists.
	 *
	 * @param string $xml_text Raw feed body.
	 * @return array<int, array{url:string,title:string,pub_ts:int,priority:int,_source:string}>
	 *         One row per item in document order; 'priority' is always 0 here (the
	 *         caller scores it) and 'pub_ts' is 0 when the date is missing or
	 *         unparseable. Returns [] for empty or malformed input.
	 */
	function urlifywriterautoscan_parse_feed( $xml_text ) {
		$items = [];
		if ( ! is_string($xml_text) ) return $items;

		// Whitespace in front of the XML declaration makes the whole document invalid.
		$xml_text = trim($xml_text);
		if ( $xml_text === '' || ! function_exists('simplexml_load_string') ) return $items;

		// Collect parser errors silently, then put the global libxml state back.
		$prev_errors = libxml_use_internal_errors(true);
		$xml         = simplexml_load_string($xml_text, 'SimpleXMLElement', LIBXML_NONET);
		libxml_clear_errors();
		libxml_use_internal_errors($prev_errors);
		if ( $xml === false ) return $items;

		// Date string -> Unix timestamp, 0 when absent or unparseable.
		$to_ts = static function ( $date ) {
			$date = trim( (string) $date );
			if ( $date === '' ) return 0;
			$ts = strtotime($date);
			return $ts === false ? 0 : (int) $ts;
		};

		// RSS 2.0
		if ( isset($xml->channel->item) ) {
			foreach ( $xml->channel->item as $it ) {
				$link = trim( (string) ( $it->link ?? '' ) );
				if ( $link === '' && isset($it->guid) ) {
					$guid         = trim( (string) $it->guid );
					$is_permalink = strtolower( trim( (string) $it->guid['isPermaLink'] ) ) !== 'false';
					if ( $is_permalink && preg_match('~^https?://~i', $guid) ) $link = $guid;
				}
				$items[] = [
					'url'      => $link,
					'title'    => trim( (string) ( $it->title ?? '' ) ),
					'pub_ts'   => $to_ts( $it->pubDate ?? '' ),
					'priority' => 0, // computed later by the caller
					'_source'  => 'rss',
				];
			}
			return $items;
		}

		// Atom 1.0
		if ( isset($xml->entry) ) {
			foreach ( $xml->entry as $entry ) {
				$link     = '';
				$fallback = '';
				if ( isset($entry->link) ) {
					foreach ( $entry->link as $ln ) {
						$href = trim( (string) $ln['href'] );
						if ( $href === '' ) continue;
						$rel = strtolower( trim( (string) $ln['rel'] ) );
						if ( $rel === '' || $rel === 'alternate' ) {
							$link = $href;
							break;
						}
						if ( $fallback === '' ) $fallback = $href;
					}
				}
				if ( $link === '' ) $link = $fallback;

				$items[] = [
					'url'      => $link,
					'title'    => trim( (string) ( $entry->title ?? '' ) ),
					'pub_ts'   => $to_ts( $entry->updated ?? $entry->published ?? '' ),
					'priority' => 0,
					'_source'  => 'atom',
				];
			}
		}

		return $items;
	}
}

/* ============================================================
 * 3) Heurística de scoring
 * ============================================================ */
if ( ! function_exists('urlifywriterautoscan_score') ) {
	/**
	 * Heuristic priority of a candidate link.
	 *
	 * The score is the sum of three parts:
	 *  - position: 100 minus the position, the position being capped at 99;
	 *  - freshness: for items published within the last 48 hours, the whole number
	 *    of hours left until the 48-hour mark (0..48);
	 *  - signals: 10 points per structural signal, clamped to 0..30.
	 *
	 * @param int $pos     Zero-based position of the link in its source.
	 * @param int $pub_ts  Publication time as a Unix timestamp, 0 when unknown.
	 * @param int $signals Number of "looks like an article" signals.
	 * @return float Combined score.
	 */
	function urlifywriterautoscan_score( $pos, $pub_ts = 0, $signals = 0 ) {
		// Position: earlier links are worth more.
		$rank = (int) $pos;
		if ( $rank > 99 ) {
			$rank = 99;
		}
		$position_points = 100 - $rank;

		// Freshness: only dated items from the last 48 hours earn a bonus.
		$fresh_points = 0;
		if ( $pub_ts > 0 ) {
			$age_hours = ( time() - $pub_ts ) / 3600.0;
			if ( $age_hours < 0 ) {
				$age_hours = 0; // dated in the future: treated as published right now
			}
			if ( $age_hours <= 48 ) {
				$fresh_points = (int) ( 48 - $age_hours );
			}
		}

		// Signals: <article> tag, heading inside a post, JSON-LD Article, ...
		$signal_points = (int) $signals * 10;
		if ( $signal_points > 30 ) {
			$signal_points = 30;
		} elseif ( $signal_points < 0 ) {
			$signal_points = 0;
		}

		return (float) ( $position_points + $fresh_points + $signal_points );
	}
}

/* ============================================================
 * 4) Extracción desde HTML (sin feed)
 * ============================================================ */
if ( ! function_exists('urlifywriterautoscan_extract_from_html') ) {
	/**
	 * Extract article candidates from a page's markup (used when no feed yields items).
	 *
	 * Links are collected in three passes, each with a weaker structural signal:
	 *   1. links inside h1/h2/h3 under an <article> element (signal 2);
	 *   2. links inside any h2/h3 or an element with class "post-title" (signal 1);
	 *   3. links in <ul><li> lists or in divs whose class contains "list"/"entries" (signal 0).
	 * A URL is kept once, by the first pass that sees it, after normalisation and the
	 * candidate-path filter. URLs also declared as Article / NewsArticle / BlogPosting
	 * in a JSON-LD block (top level, list, or inside "@graph") get +15 priority.
	 *
	 * @param string $html     Page markup.
	 * @param string $base_url URL of the page (resolves relative links, fixes the host).
	 * @return array<int, array{url:string,title:string,priority:float}>
	 *         At most 100 rows, sorted by priority descending. Returns [] when the
	 *         base URL has no host or the markup is empty or cannot be parsed.
	 */
	function urlifywriterautoscan_extract_from_html( $html, $base_url ) {
		$out  = [];
		$host = wp_parse_url($base_url, PHP_URL_HOST);
		if ( ! $host ) return $out;

		// DOMDocument::loadHTML() throws a ValueError for an empty string on PHP 8,
		// and the "@" operator cannot silence an exception.
		if ( ! is_string($html) || trim($html) === '' ) return $out;
		if ( ! class_exists('DOMDocument') || ! class_exists('DOMXPath') ) return $out;

		// Make the markup encoding-proof. After conversion to UTF-8 every non-ASCII
		// character is written as a numeric entity, so the parser's own charset guess
		// (or a stale <meta charset>) can no longer garble titles.
		$markup   = $html;
		$entities = [ 0x80, 0x10FFFF, 0, 0x1FFFFF ];
		$encoded  = false;
		if ( function_exists('mb_detect_encoding') && function_exists('mb_convert_encoding') ) {
			$enc = mb_detect_encoding($html, 'UTF-8, ISO-8859-1, ASCII', true) ?: 'UTF-8';
			if ( strtoupper($enc) !== 'UTF-8' ) {
				$markup = mb_convert_encoding($html, 'UTF-8', $enc);
			}
			if ( function_exists('mb_encode_numericentity') && function_exists('mb_decode_numericentity') ) {
				$markup  = mb_encode_numericentity($markup, $entities, 'UTF-8');
				$encoded = true;
			}
		}

		// Parse with libxml errors collected silently, then restore the global setting.
		$prev_errors = libxml_use_internal_errors(true);
		$doc         = new DOMDocument();
		$loaded      = $doc->loadHTML($markup);
		libxml_clear_errors();
		libxml_use_internal_errors($prev_errors);
		if ( ! $loaded ) return $out;

		$xp   = new DOMXPath($doc);
		$seen = []; // normalised URL => index of its row in $out
		$pos  = 0;  // running position across all passes, fed to the scorer

		// Adds every not-yet-seen candidate link of a node list with the given signal strength.
		$collect = static function ( $nodes, $signals ) use ( &$out, &$seen, &$pos, $base_url, $host ) {
			if ( ! $nodes ) return;
			foreach ( $nodes as $a ) {
				$href = trim( $a->getAttribute('href') );
				if ( ! $href ) continue;

				$url = urlifywriterautoscan_norm_url( urlifywriterautoscan_abs_url($href, $base_url) );
				if ( ! $url || isset($seen[$url]) ) continue;
				if ( ! urlifywriterautoscan_is_candidate_path($url, $host) ) continue;

				$seen[$url] = count($out);
				$out[]      = [
					'url'      => $url,
					'title'    => trim( $a->textContent ),
					'priority' => urlifywriterautoscan_score($pos, 0, $signals),
				];
				$pos++;
			}
		};

		// 1) Heading links inside <article>: strongest signal.
		$collect( $xp->query('//article//*[self::h1 or self::h2 or self::h3]/a[@href]'), 2 );

		// 2) Visible titles: h2/a, h3/a and .post-title/a.
		foreach ( [
			'//h2/a[@href]',
			'//h3/a[@href]',
			'//*[@class and contains(concat(" ", normalize-space(@class), " "), " post-title ")]/a[@href]',
		] as $q ) {
			$collect( $xp->query($q), 1 );
		}

		// 3) Any link in front-page lists.
		$collect( $xp->query('//ul/li/a[@href] | //div[contains(@class,"list") or contains(@class,"entries")]//a[@href]'), 0 );

		// 4) JSON-LD bonus: nothing new is extracted here, an already captured URL is only boosted.
		$article_types = [ 'article', 'newsarticle', 'blogposting' ];
		$scripts       = $xp->query('//script[@type="application/ld+json"]');
		foreach ( $scripts ? $scripts : [] as $script ) {
			$json = trim( $script->textContent );
			if ( $json === '' ) continue;
			// Script content is raw text, so the entities written above are still literal: undo them.
			if ( $encoded ) $json = mb_decode_numericentity($json, $entities, 'UTF-8');

			$data = json_decode($json, true);
			if ( ! is_array($data) || ! $data ) continue;

			// A single object is wrapped into a one-element list.
			$is_list = array_keys($data) === range(0, count($data) - 1);
			$nodes   = [];
			foreach ( $is_list ? $data : [ $data ] as $node ) {
				if ( ! is_array($node) ) continue;
				$nodes[] = $node;
				// Many sites nest the real nodes under "@graph".
				if ( isset($node['@graph']) && is_array($node['@graph']) ) {
					foreach ( $node['@graph'] as $inner ) {
						if ( is_array($inner) ) $nodes[] = $inner;
					}
				}
			}

			foreach ( $nodes as $node ) {
				if ( ! isset($node['@type']) ) continue;
				$types = array_map( 'strtolower', array_filter( (array) $node['@type'], 'is_string' ) );
				if ( ! array_intersect($article_types, $types) ) continue;

				$u = '';
				if ( ! empty($node['url']) && is_string($node['url']) ) {
					$u = $node['url'];
				} elseif ( ! empty($node['mainEntityOfPage']) ) {
					$main = $node['mainEntityOfPage'];
					if ( is_string($main) ) {
						$u = $main;
					} elseif ( is_array($main) && ! empty($main['@id']) && is_string($main['@id']) ) {
						$u = $main['@id'];
					}
				}
				if ( $u === '' ) continue;

				$u = urlifywriterautoscan_norm_url( urlifywriterautoscan_abs_url($u, $base_url) );
				if ( ! $u || ! isset($seen[$u]) ) continue;

				$out[ $seen[$u] ]['priority'] += 15.0;
			}
		}

		// Clean the titles: strip tags and collapse whitespace.
		foreach ( $out as $i => $row ) {
			$plain = wp_strip_all_tags( $row['title'] );
			$clean = preg_replace('/\s+/u', ' ', $plain);
			$out[ $i ]['title'] = trim( $clean === null ? $plain : $clean );
		}

		// Highest priority first. The indexes kept in $seen are stale from here on.
		usort( $out, static fn( $a, $b ) => $b['priority'] <=> $a['priority'] );

		// Cap at 100 candidates so callers are not flooded.
		return array_slice($out, 0, 100);
	}
}


/* ============================================================
 * 5) Orquestador principal
 * ============================================================ */
if ( ! function_exists('urlifywriterautoscan_extract_links') ) {
	/**
	 * Main entry point: return article candidates for a site front page.
	 *
	 * Feeds are tried first, either advertised in the markup or guessed by
	 * urlifywriterautoscan_find_feeds(); the first feed that yields any item wins.
	 * If no candidate comes out of a feed, the markup itself is scanned. Titles from
	 * both sources are stripped of tags and have their whitespace collapsed, and the
	 * final list holds each URL once (first occurrence kept).
	 *
	 * This function performs HTTP requests (one per feed tried) but writes nothing.
	 *
	 * @param string $html      Front-page HTML.
	 * @param string $base_url  Base URL (domain) of the site.
	 * @return array<int, array{url:string,title:string,priority:float}>
	 */
	function urlifywriterautoscan_extract_links( $html, $base_url ) {
		$base_url = urlifywriterautoscan_norm_url($base_url);
		if ( ! $base_url ) return [];

		$host = wp_parse_url($base_url, PHP_URL_HOST);
		if ( ! $host ) return [];

		$candidates = [];

		// 1) Feeds first: they are usually clean and carry dates.
		foreach ( urlifywriterautoscan_find_feeds($html, $base_url) as $feed_url ) {
			$xml = urlifywriterautoscan_fetch($feed_url, 12);
			if ( is_wp_error($xml) || ! is_string($xml) || $xml === '' ) continue;

			$items = urlifywriterautoscan_parse_feed($xml);
			if ( empty($items) ) continue;

			$pos = 0;
			foreach ( $items as $it ) {
				$u = urlifywriterautoscan_norm_url( urlifywriterautoscan_abs_url( $it['url'] ?? '', $base_url ) );
				if ( ! $u ) continue;
				if ( ! urlifywriterautoscan_is_candidate_path($u, $host) ) continue;

				// Same clean-up the HTML extractor applies, so both sources return comparable titles.
				$title = trim( preg_replace( '/\s+/', ' ', wp_strip_all_tags( (string) ( $it['title'] ?? '' ) ) ) );

				$candidates[] = [
					'url'      => $u,
					'title'    => $title,
					'priority' => urlifywriterautoscan_score( $pos, (int) ( $it['pub_ts'] ?? 0 ), 2 ),
				];
				$pos++;
			}

			// One feed with items is enough; do not query the remaining ones.
			break;
		}

		// 2) No candidate from feeds: fall back to the markup, provided there is any.
		//    (The HTML extractor hands the string to DOMDocument, which rejects '' on PHP 8.)
		if ( empty($candidates) && is_string($html) && trim($html) !== '' ) {
			$candidates = urlifywriterautoscan_extract_from_html($html, $base_url);
		}

		// 3) De-duplicate by URL, keeping the first occurrence.
		$seen = [];
		$out  = [];
		foreach ( $candidates as $row ) {
			$u = $row['url'];
			if ( isset($seen[$u]) ) continue;
			$seen[$u] = true;
			$out[]    = $row;
		}

		return $out;
	}
}
