Problema: - URLs en analytics mostraban dominio incorrecto (HTTP_HOST) - URLs usaban formato /?p=ID en lugar de permalinks Solución: - class-search-engine.php: Agregar propiedad $prefix, incluir p.post_name en 5 queries fetch, agregar helpers getSiteUrlFromDb(), getPermalinkStructure() y buildPermalink() - search-endpoint.php: Obtener site_url y permalink_structure desde wp_options, construir URLs con post_name - click-endpoint.php: Fallback de dest usando post_name desde BD Archivos modificados: - includes/class-search-engine.php - api/search-endpoint.php - api/click-endpoint.php 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
827 lines
26 KiB
PHP
827 lines
26 KiB
PHP
<?php
|
|
/**
|
|
* Hybrid Search Engine with multi-bucket scoring
|
|
*
|
|
* @package ROI_APU_Search
|
|
*/
|
|
|
|
declare(strict_types=1);
|
|
|
|
// Prevent direct access
|
|
if (!defined('ABSPATH')) {
|
|
exit;
|
|
}
|
|
|
|
/**
|
|
* Search engine with hybrid multi-bucket algorithm and category filtering
|
|
*/
|
|
final class ROI_APU_Search_Engine
|
|
{
|
|
private PDO $pdo;
|
|
private string $prefix;
|
|
private string $posts_table;
|
|
private string $term_rel_table;
|
|
private string $term_tax_table;
|
|
|
|
// Scoring weights (same as original)
|
|
private const RAW_REL_MULT = 40.0;
|
|
private const W_COVERAGE = 200.0;
|
|
private const W_STARTSWITH = 240.0;
|
|
private const W_WORD_EXACT = 140.0;
|
|
private const W_FUZZY_TOKEN_MAX = 120.0;
|
|
private const W_RECENCY_MAX = 120.0;
|
|
private const W_PROX_CHARS = 620.0;
|
|
private const W_ORDERED_WINDOW = 1600.0;
|
|
private const W_ORDERED_ANCHOR = 300.0;
|
|
private const LEN_PEN_START = 180;
|
|
private const LEN_PEN_PER_CHAR = 0.55;
|
|
private const REQ_MISS_PER_TOKEN = 420.0;
|
|
private const REQ_BASE_PENALTY = 140.0;
|
|
|
|
/**
 * Stores the PDO handle and derives the table names used by the queries.
 *
 * The table prefix is taken from the global $table_prefix and falls back
 * to 'wp_' when that global is unset or null.
 *
 * @param PDO $pdo Connection used for every query issued by this class.
 */
public function __construct(PDO $pdo)
{
    global $table_prefix;

    $this->pdo = $pdo;
    $this->prefix = $table_prefix ?? 'wp_';

    $this->posts_table = "{$this->prefix}posts";
    $this->term_rel_table = "{$this->prefix}term_relationships";
    $this->term_tax_table = "{$this->prefix}term_taxonomy";
}
|
|
|
|
/**
 * Run the hybrid search: collect candidates from several SQL "buckets",
 * de-duplicate them by normalized title, score, sort and paginate.
 *
 * The result is looked up in (and, when non-empty, stored to) the Redis
 * cache under a key derived from all four arguments.
 *
 * @param string $term         Search term
 * @param int    $limit        Results per page
 * @param int    $offset       Pagination offset
 * @param array  $category_ids Optional category IDs to filter by
 * @return array Keys: 'total' (size of the de-duplicated candidate pool, which is
 *               bounded by the per-bucket caps), 'rows', 'modo', 'time_ms';
 *               cache hits additionally carry 'cached' => true.
 */
public function run(string $term, int $limit, int $offset, array $category_ids = []): array
{
    $startedAt = microtime(true);
    $elapsedMs = static fn (): float => round((microtime(true) - $startedAt) * 1000, 2);

    // Serve from Redis when possible; only the timing is refreshed.
    $redis = ROI_APU_Search_Redis::get_instance();
    $cacheKey = $redis->generateKey($term, $limit, $offset, $category_ids);
    $hit = $redis->get($cacheKey);
    if ($hit !== null) {
        $hit['time_ms'] = $elapsedMs();
        $hit['cached'] = true;
        return $hit;
    }

    $tokens = self::tokens($term);

    // Per-bucket candidate caps, scaled with the page size and clamped.
    $clamp = static fn (int $lo, int $hi, int $factor): int => max($lo, min($hi, $limit * $factor));
    $capFull = $clamp(120, 300, 8);
    $capLike = $clamp(120, 300, 8);
    $capPref = $clamp(60, 200, 4);
    $capEq = min(40, $limit * 2);
    $capCont = $clamp(80, 240, 6);

    // Bucket order matters: the first bucket that yields a title wins the dedup below.
    $buckets = [];
    $buckets[] = ['name' => 'LIKE_ALL', 'base' => 900.0, 'rows' => $this->fetchAllTokensLike($tokens, $capLike, $category_ids)];
    $buckets[] = ['name' => 'FULLTEXT', 'base' => 700.0, 'rows' => $this->fetchFulltextTitle($term, $capFull, $category_ids)];
    $buckets[] = ['name' => 'STARTS', 'base' => 650.0, 'rows' => $this->fetchStartsWith($term, $capPref, $category_ids)];
    $buckets[] = ['name' => 'EQUALS', 'base' => 1200.0, 'rows' => $this->fetchEquals($term, $capEq, $category_ids)];
    if (count($tokens) > 1) {
        // With a single token LIKE_ALL already performs the same '%token%' match.
        $buckets[] = ['name' => 'CONTAINS', 'base' => 500.0, 'rows' => $this->fetchContains($term, $capCont, $category_ids)];
    }

    // De-duplicate by normalized title, keeping the first occurrence.
    // NOTE(review): a row already returned by LIKE_ALL keeps base 900 and raw_rel 0 even
    // if EQUALS (1200) or FULLTEXT (with relevance) also return it — confirm this is intended.
    $pool = [];
    $seenTitles = [];
    foreach ($buckets as $bucket) {
        foreach ($bucket['rows'] as $row) {
            $titleKey = self::normTitle($row['post_title']);
            if (isset($seenTitles[$titleKey])) {
                continue;
            }
            $seenTitles[$titleKey] = true;

            $pool[] = [
                'ID' => (int) $row['ID'],
                'post_title' => (string) $row['post_title'],
                'post_date' => (string) $row['post_date'],
                'post_name' => (string) ($row['post_name'] ?? ''),
                'bucket' => $bucket['name'],
                'baseW' => (float) $bucket['base'],
                'raw_rel' => isset($row['raw_rel']) ? (float) $row['raw_rel'] : 0.0,
            ];
        }
    }

    $poolTotal = count($pool);
    if ($poolTotal === 0) {
        // Empty results are returned without being written to the cache.
        return ['total' => 0, 'rows' => [], 'modo' => 'HYBRID', 'time_ms' => $elapsedMs()];
    }

    // Score every candidate: bucket base weight plus the ranking signals.
    foreach ($pool as $idx => $candidate) {
        $title = $candidate['post_title'];

        $pool[$idx]['score'] = $candidate['baseW']
            + ($candidate['raw_rel'] * self::RAW_REL_MULT)
            + self::coverageBoost($title, $tokens)
            + self::orderedWindowBoost($title, $tokens)
            + self::proximityBoost($title, $tokens)
            + self::startsWithBoost($title, $term)
            + self::wordExactBoost($title, $term)
            + (self::levenshteinSimilarity($title, $term) * 160.0)
            + self::tokenFuzzyBoost($title, $tokens)
            + self::recencyBoost($candidate['post_date'])
            + self::lengthPenalty($title)
            + self::requiredTokensPenalty($title, $tokens);
    }

    // Highest score first; ties are broken by the newer post_date string.
    usort(
        $pool,
        static fn (array $a, array $b): int => ($b['score'] <=> $a['score'])
            ?: strcmp($b['post_date'], $a['post_date'])
    );

    // Paginate and expose only the public fields.
    $rows = [];
    foreach (array_slice($pool, $offset, $limit) as $entry) {
        $rows[] = [
            'ID' => $entry['ID'],
            'post_title' => $entry['post_title'],
            'post_date' => $entry['post_date'],
            'post_name' => $entry['post_name'],
            'permalink' => '', // Filled in later by search-endpoint.php
        ];
    }

    $result = [
        'total' => $poolTotal,
        'rows' => $rows,
        'modo' => 'HYBRID',
        'time_ms' => $elapsedMs(),
    ];

    $redis->set($cacheKey, $result);

    return $result;
}
|
|
|
|
/**
 * Build the JOIN fragment that links posts (alias p) to their 'category' terms.
 *
 * @param array $category_ids Category IDs requested by the caller.
 * @return string Empty string when no IDs were given; otherwise two INNER JOINs
 *                exposing the aliases tr and tt.
 */
private function buildCategoryJoin(array $category_ids): string
{
    if ($category_ids === []) {
        return '';
    }

    $relTable = $this->term_rel_table;
    $taxTable = $this->term_tax_table;

    return " INNER JOIN {$relTable} tr ON p.ID = tr.object_id
        INNER JOIN {$taxTable} tt ON tr.term_taxonomy_id = tt.term_taxonomy_id
        AND tt.taxonomy = 'category' ";
}
|
|
|
|
/**
 * Build the "AND tt.term_id IN (...)" fragment and register its bound values.
 *
 * Placeholder names are generated from a fresh 0..n-1 index, never from the
 * caller's array keys, because the names are concatenated into the SQL text.
 * Every ID is bound as an integer; non-scalar entries are bound as 0.
 *
 * @param array $category_ids Category IDs requested by the caller.
 * @param array $params       Placeholder => value map, extended in place with ':cat_N' entries.
 * @return string Empty string when no IDs were given, otherwise the SQL fragment.
 */
private function buildCategoryWhere(array $category_ids, array &$params): string
{
    if ($category_ids === []) {
        return '';
    }

    $placeholders = [];
    foreach (array_values($category_ids) as $index => $categoryId) {
        $placeholder = ":cat_{$index}";
        $placeholders[] = $placeholder;
        $params[$placeholder] = is_scalar($categoryId) ? (int) $categoryId : 0;
    }

    return ' AND tt.term_id IN (' . implode(',', $placeholders) . ')';
}
|
|
|
|
/**
 * Fetch published posts whose title equals the term under utf8mb4_general_ci.
 *
 * @param string $term         Raw search term.
 * @param int    $limit        Maximum number of rows.
 * @param array  $category_ids Optional category filter.
 * @return array Rows (ID, post_title, post_date, post_name), newest first,
 *               as returned by PDOStatement::fetchAll().
 */
private function fetchEquals(string $term, int $limit, array $category_ids): array
{
    $bindings = [':t' => $term, ':lim' => $limit];
    $joinSql = $this->buildCategoryJoin($category_ids);
    $filterSql = $this->buildCategoryWhere($category_ids, $bindings);

    $statement = $this->pdo->prepare(
        "SELECT DISTINCT p.ID, p.post_title, p.post_date, p.post_name
        FROM {$this->posts_table} p
        {$joinSql}
        WHERE p.post_type = 'post' AND p.post_status = 'publish'
        AND p.post_title COLLATE utf8mb4_general_ci = :t
        {$filterSql}
        ORDER BY p.post_date DESC
        LIMIT :lim"
    );

    // Integers (LIMIT, category IDs) must be bound as PARAM_INT.
    foreach ($bindings as $placeholder => $value) {
        $type = is_int($value) ? PDO::PARAM_INT : PDO::PARAM_STR;
        $statement->bindValue($placeholder, $value, $type);
    }
    $statement->execute();

    return $statement->fetchAll();
}
|
|
|
|
/**
 * Fetch published posts whose title starts with the term (LIKE 'term%').
 *
 * @param string $term         Raw search term; LIKE wildcards in it are escaped.
 * @param int    $limit        Maximum number of rows.
 * @param array  $category_ids Optional category filter.
 * @return array Rows (ID, post_title, post_date, post_name), newest first,
 *               as returned by PDOStatement::fetchAll().
 */
private function fetchStartsWith(string $term, int $limit, array $category_ids): array
{
    // Escape backslash, % and _ so the term is matched literally.
    $pattern = strtr($term, ['\\' => '\\\\', '%' => '\%', '_' => '\_']) . '%';

    $bindings = [':p' => $pattern, ':lim' => $limit];
    $joinSql = $this->buildCategoryJoin($category_ids);
    $filterSql = $this->buildCategoryWhere($category_ids, $bindings);

    $statement = $this->pdo->prepare(
        "SELECT DISTINCT p.ID, p.post_title, p.post_date, p.post_name
        FROM {$this->posts_table} p
        {$joinSql}
        WHERE p.post_type = 'post' AND p.post_status = 'publish'
        AND p.post_title LIKE :p ESCAPE '\\\\'
        {$filterSql}
        ORDER BY p.post_date DESC
        LIMIT :lim"
    );

    // Integers (LIMIT, category IDs) must be bound as PARAM_INT.
    foreach ($bindings as $placeholder => $value) {
        $type = is_int($value) ? PDO::PARAM_INT : PDO::PARAM_STR;
        $statement->bindValue($placeholder, $value, $type);
    }
    $statement->execute();

    return $statement->fetchAll();
}
|
|
|
|
/**
 * Fetch published posts matching the term through a BOOLEAN MODE full-text
 * search on post_title, exposing the match relevance as raw_rel.
 *
 * The boolean query is bound twice under two different placeholder names:
 * PDO does not allow the same named marker to appear more than once in a
 * statement unless emulated prepares are enabled.
 *
 * @param string $term         Raw search term, converted by booleanQuery().
 * @param int    $limit        Maximum number of rows.
 * @param array  $category_ids Optional category filter.
 * @return array Rows (ID, post_title, post_date, post_name, raw_rel) ordered by
 *               relevance then date; empty when the boolean query is empty.
 */
private function fetchFulltextTitle(string $term, int $limit, array $category_ids): array
{
    $booleanQuery = self::booleanQuery($term);
    if ($booleanQuery === '') {
        return [];
    }

    $bindings = [
        ':q_score' => $booleanQuery,
        ':q_match' => $booleanQuery,
        ':lim' => $limit,
    ];
    $joinSql = $this->buildCategoryJoin($category_ids);
    $filterSql = $this->buildCategoryWhere($category_ids, $bindings);

    $statement = $this->pdo->prepare(
        "SELECT DISTINCT p.ID, p.post_title, p.post_date, p.post_name,
        MATCH(p.post_title) AGAINST (:q_score IN BOOLEAN MODE) AS raw_rel
        FROM {$this->posts_table} p
        {$joinSql}
        WHERE p.post_type = 'post' AND p.post_status = 'publish'
        AND MATCH(p.post_title) AGAINST (:q_match IN BOOLEAN MODE)
        {$filterSql}
        ORDER BY raw_rel DESC, p.post_date DESC
        LIMIT :lim"
    );

    // Integers (LIMIT, category IDs) must be bound as PARAM_INT.
    foreach ($bindings as $placeholder => $value) {
        $type = is_int($value) ? PDO::PARAM_INT : PDO::PARAM_STR;
        $statement->bindValue($placeholder, $value, $type);
    }
    $statement->execute();

    return $statement->fetchAll();
}
|
|
|
|
/**
 * Fetch published posts whose title contains every token (AND of LIKE '%token%').
 *
 * @param array $tokens       Tokens as produced by tokens(); LIKE wildcards are escaped.
 * @param int   $limit        Maximum number of rows.
 * @param array $category_ids Optional category filter.
 * @return array Rows (ID, post_title, post_date, post_name), newest first;
 *               empty when there are no tokens.
 */
private function fetchAllTokensLike(array $tokens, int $limit, array $category_ids): array
{
    if ($tokens === []) {
        return [];
    }

    $bindings = [':lim' => $limit];
    $conditions = [];
    foreach ($tokens as $index => $token) {
        $placeholder = ":lk{$index}";
        $conditions[] = "p.post_title LIKE {$placeholder} ESCAPE '\\\\'";
        // Escape backslash, % and _ so the token is matched literally.
        $bindings[$placeholder] = '%' . strtr($token, ['\\' => '\\\\', '%' => '\%', '_' => '\_']) . '%';
    }
    $tokenSql = implode(' AND ', $conditions);

    $joinSql = $this->buildCategoryJoin($category_ids);
    $filterSql = $this->buildCategoryWhere($category_ids, $bindings);

    $statement = $this->pdo->prepare(
        "SELECT DISTINCT p.ID, p.post_title, p.post_date, p.post_name
        FROM {$this->posts_table} p
        {$joinSql}
        WHERE p.post_type = 'post' AND p.post_status = 'publish' AND {$tokenSql}
        {$filterSql}
        ORDER BY p.post_date DESC
        LIMIT :lim"
    );

    // Integers (LIMIT, category IDs) must be bound as PARAM_INT.
    foreach ($bindings as $name => $value) {
        $type = is_int($value) ? PDO::PARAM_INT : PDO::PARAM_STR;
        $statement->bindValue($name, $value, $type);
    }
    $statement->execute();

    return $statement->fetchAll();
}
|
|
|
|
/**
 * Fetch published posts whose title contains the whole term (LIKE '%term%').
 *
 * @param string $term         Raw search term; LIKE wildcards in it are escaped.
 * @param int    $limit        Maximum number of rows.
 * @param array  $category_ids Optional category filter.
 * @return array Rows (ID, post_title, post_date, post_name), newest first,
 *               as returned by PDOStatement::fetchAll().
 */
private function fetchContains(string $term, int $limit, array $category_ids): array
{
    // Escape backslash, % and _ so the term is matched literally.
    $pattern = '%' . strtr($term, ['\\' => '\\\\', '%' => '\%', '_' => '\_']) . '%';

    $bindings = [':l' => $pattern, ':lim' => $limit];
    $joinSql = $this->buildCategoryJoin($category_ids);
    $filterSql = $this->buildCategoryWhere($category_ids, $bindings);

    $statement = $this->pdo->prepare(
        "SELECT DISTINCT p.ID, p.post_title, p.post_date, p.post_name
        FROM {$this->posts_table} p
        {$joinSql}
        WHERE p.post_type = 'post' AND p.post_status = 'publish'
        AND p.post_title LIKE :l ESCAPE '\\\\'
        {$filterSql}
        ORDER BY p.post_date DESC
        LIMIT :lim"
    );

    // Integers (LIMIT, category IDs) must be bound as PARAM_INT.
    foreach ($bindings as $placeholder => $value) {
        $type = is_int($value) ? PDO::PARAM_INT : PDO::PARAM_STR;
        $statement->bindValue($placeholder, $value, $type);
    }
    $statement->execute();

    return $statement->fetchAll();
}
|
|
|
|
// ==================== Text/Token Utilities ====================
|
|
|
|
/**
 * Fold a string for comparison: lowercase it, transliterate to ASCII when
 * iconv succeeds, turn every run of characters other than letters, digits
 * and spaces into one space, then trim and collapse whitespace.
 *
 * @param string $s Text to fold (treated as UTF-8).
 * @return string Folded text with single spaces between words.
 */
private static function asciiFold(string $s): string
{
    $lowered = mb_strtolower($s, 'UTF-8');

    // Transliteration is best effort: on failure the lowercased text is used as is.
    $ascii = @iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $lowered);
    $base = ($ascii === false) ? $lowered : $ascii;

    $spaced = preg_replace('/[^a-z0-9 ]+/i', ' ', $base);

    return preg_replace('/\s+/', ' ', trim($spaced));
}
|
|
|
|
/**
 * Produce the de-duplication key for a title: its folded form cut to 160 bytes.
 *
 * @param string $title Post title.
 * @return string Folded title, at most 160 bytes long.
 */
private static function normTitle(string $title): string
{
    $folded = self::asciiFold($title);

    return substr($folded, 0, 160);
}
|
|
|
|
/**
 * Split a search term into unique folded tokens, preserving first-seen order.
 *
 * A word is kept when it is all digits, is at least two bytes long, or is
 * one of the single letters 'f' / 'x'; any other single character is dropped.
 *
 * @param string $term Raw search term.
 * @return array List of token strings.
 */
private static function tokens(string $term): array
{
    $singleLetterWhitelist = ['f', 'x'];

    $kept = [];
    $alreadySeen = [];
    foreach (explode(' ', self::asciiFold($term)) as $word) {
        if ($word === '') {
            continue;
        }

        $accepted = ctype_digit($word)
            || strlen($word) >= 2
            || in_array($word, $singleLetterWhitelist, true);
        if (!$accepted || isset($alreadySeen[$word])) {
            continue;
        }

        // The output list is built separately so numeric tokens stay strings
        // (PHP would turn them into integer keys in the lookup array).
        $alreadySeen[$word] = true;
        $kept[] = $word;
    }

    return $kept;
}
|
|
|
|
/**
 * Convert free text into a MySQL BOOLEAN MODE full-text expression.
 *
 * - Double-quoted phrases become required phrases: +"some phrase".
 *   An unterminated quote is closed at the end of the input; empty phrases are dropped.
 * - Every other word becomes a required prefix match: +word*.
 * - The bare words AND / OR / NOT are passed through in upper case.
 * - Words shorter than two characters are dropped.
 *
 * Boolean operators typed by the user (+, -, *) are treated as separators
 * outside quotes. Each emitted word therefore carries exactly one leading '+'
 * and one trailing '*'; stacked operators such as '+c++*' or '+-word*' are
 * rejected by InnoDB as syntax errors.
 *
 * @param string $input Raw search term.
 * @return string Boolean-mode query, or '' when nothing searchable remains
 *                (including input that is not valid UTF-8).
 */
private static function booleanQuery(string $input): string
{
    // Keep letters, digits, whitespace, quotes and the operator characters only.
    // preg_replace() yields null on invalid UTF-8; treat that as "nothing to search".
    $clean = preg_replace("/[^\\p{L}\\p{N}\\s\"'\\+\\-\\*]/u", ' ', trim($input)) ?? '';

    // A chunk is either a quoted phrase (closing quote optional) or a run of
    // characters that are neither whitespace nor a double quote.
    if (!preg_match_all('/"([^"]*)"?|[^\s"]+/u', $clean, $chunks, PREG_SET_ORDER)) {
        return '';
    }

    $parts = [];
    foreach ($chunks as $chunk) {
        if ($chunk[0][0] === '"') {
            $phrase = $chunk[1] ?? '';
            if (preg_match('/\S/u', $phrase) === 1) {
                $parts[] = '+"' . $phrase . '"';
            }
            continue;
        }

        // Split on user-typed operators so none of them reaches MySQL.
        foreach (preg_split('/[+\-*]+/', $chunk[0], -1, PREG_SPLIT_NO_EMPTY) as $word) {
            if (mb_strlen($word, 'UTF-8') < 2) {
                continue;
            }
            $upper = strtoupper($word);
            $parts[] = in_array($upper, ['AND', 'OR', 'NOT'], true)
                ? $upper
                : '+' . $word . '*';
        }
    }

    return implode(' ', $parts);
}
|
|
|
|
// ==================== Scoring Functions ====================
|
|
|
|
/**
 * Reward titles in proportion to how many tokens they contain as substrings.
 *
 * @param string $title  Post title (folded internally).
 * @param array  $tokens Folded search tokens.
 * @return float W_COVERAGE scaled by the fraction of tokens found; 0.0 without tokens.
 */
private static function coverageBoost(string $title, array $tokens): float
{
    if ($tokens === []) {
        return 0.0;
    }

    $haystack = self::asciiFold($title);
    $matched = count(array_filter(
        $tokens,
        static fn ($tok) => $tok !== '' && str_contains($haystack, $tok)
    ));

    return ($matched / max(1, count($tokens))) * self::W_COVERAGE;
}
|
|
|
|
/**
 * Penalize titles that lack some of the tokens, for queries of 1 to 4 tokens.
 *
 * @param string $title  Post title (folded internally).
 * @param array  $tokens Folded search tokens.
 * @return float 0.0 when nothing is missing or the token count is outside 1..4;
 *               otherwise -(REQ_BASE_PENALTY + REQ_MISS_PER_TOKEN * missing).
 */
private static function requiredTokensPenalty(string $title, array $tokens): float
{
    $total = count($tokens);
    if ($total < 1 || $total > 4) {
        return 0.0;
    }

    $haystack = self::asciiFold($title);

    // An empty token can never be "found", so it counts as missing.
    $missing = 0;
    foreach ($tokens as $tok) {
        if ($tok === '' || !str_contains($haystack, $tok)) {
            $missing++;
        }
    }

    if ($missing === 0) {
        return 0.0;
    }

    return -(self::REQ_BASE_PENALTY + self::REQ_MISS_PER_TOKEN * $missing);
}
|
|
|
|
/**
 * Reward titles whose folded form begins with the folded search term.
 *
 * A term that folds to the empty string (e.g. only punctuation) earns no
 * boost: the empty string is a prefix of every title, so without the guard
 * every candidate would receive W_STARTSWITH.
 *
 * @param string $title Post title.
 * @param string $term  Raw search term.
 * @return float W_STARTSWITH on a prefix match, otherwise 0.0.
 */
private static function startsWithBoost(string $title, string $term): float
{
    $needle = self::asciiFold($term);
    if ($needle === '') {
        return 0.0;
    }

    return str_starts_with(self::asciiFold($title), $needle) ? self::W_STARTSWITH : 0.0;
}
|
|
|
|
/**
 * Reward titles that contain the folded term delimited by word boundaries.
 *
 * @param string $title Post title.
 * @param string $term  Raw search term.
 * @return float W_WORD_EXACT when the term appears as whole word(s); 0.0 otherwise
 *               or when the term folds to an empty string.
 */
private static function wordExactBoost(string $title, string $term): float
{
    $needle = self::asciiFold($term);
    if ($needle === '') {
        return 0.0;
    }

    $pattern = '/\b' . preg_quote($needle, '/') . '\b/u';
    $subject = ' ' . self::asciiFold($title) . ' ';

    return preg_match($pattern, $subject) === 1 ? self::W_WORD_EXACT : 0.0;
}
|
|
|
|
/**
 * Reward recent posts: the boost decays with the post's age in days.
 *
 * @param string $date Date string understood by strtotime().
 * @return float W_RECENCY_MAX / (1 + days / 180), where days is never below 1;
 *               0.0 when the date cannot be parsed (or parses to timestamp 0).
 */
private static function recencyBoost(string $date): float
{
    $timestamp = strtotime($date);
    if ($timestamp === false || $timestamp === 0) {
        return 0.0;
    }

    // Ages under one day (including future dates) are clamped to one day.
    $ageInDays = max(1, (time() - $timestamp) / 86400);

    return self::W_RECENCY_MAX / (1.0 + $ageInDays / 180.0);
}
|
|
|
|
/**
 * Similarity in [0, 1] between two strings, based on the Levenshtein distance
 * of their folded forms (each cut to 80 bytes).
 *
 * @param string $a First string.
 * @param string $b Second string.
 * @return float 1 - distance / longest length; 0.0 when either folded string is empty.
 */
private static function levenshteinSimilarity(string $a, string $b): float
{
    $left = substr(self::asciiFold($a), 0, 80);
    $right = substr(self::asciiFold($b), 0, 80);

    if ($left === '' || $right === '') {
        return 0.0;
    }

    // Both strings are non-empty here, so the divisor is at least 1.
    $longest = max(strlen($left), strlen($right));
    $distance = levenshtein($left, $right);

    return max(0.0, 1.0 - ($distance / $longest));
}
|
|
|
|
/**
 * Reward the best fuzzy match between any token and any of the first
 * twelve words of the folded title.
 *
 * @param string $title  Post title.
 * @param array  $tokens Search tokens.
 * @return float W_FUZZY_TOKEN_MAX scaled by the highest Levenshtein similarity found;
 *               0.0 without tokens or comparable words.
 */
private static function tokenFuzzyBoost(string $title, array $tokens): float
{
    if ($tokens === []) {
        return 0.0;
    }

    $titleWords = array_slice(preg_split('/\s+/', self::asciiFold($title)), 0, 12);
    if ($titleWords === []) {
        return 0.0;
    }

    $bestSimilarity = 0.0;
    foreach ($tokens as $tok) {
        $needle = self::asciiFold($tok);
        if ($needle === '') {
            continue;
        }
        $needleLength = strlen($needle);

        foreach ($titleWords as $word) {
            if ($word === '') {
                continue;
            }
            $longest = max($needleLength, strlen($word));
            $similarity = 1.0 - (levenshtein($needle, $word) / $longest);
            $bestSimilarity = max($bestSimilarity, $similarity);
        }
    }

    return max(0.0, $bestSimilarity) * self::W_FUZZY_TOKEN_MAX;
}
|
|
|
|
/**
 * Locate every occurrence of a token in a folded title that starts at a word
 * boundary (preceded by a non-alphanumeric byte or the start of the string).
 *
 * Tokens made only of letters or only of digits must also end at a word
 * boundary. Tokens mixing letters and digits may be followed by anything.
 *
 * @param string $foldedTitle Title already passed through asciiFold().
 * @param string $token       Folded token to search for.
 * @return array List of [start, end) byte offset pairs, in ascending order.
 */
private static function findPositions(string $foldedTitle, string $token): array
{
    $tokenLength = strlen($token);
    if ($tokenLength === 0) {
        return [];
    }
    $titleLength = strlen($foldedTitle);

    $isMixed = preg_match('/[a-z]/', $token) === 1
        && preg_match('/[0-9]/', $token) === 1;

    $matches = [];
    // Advance by one byte after each hit so overlapping occurrences are examined too.
    for (
        $at = strpos($foldedTitle, $token);
        $at !== false;
        $at = strpos($foldedTitle, $token, $at + 1)
    ) {
        $stop = $at + $tokenLength;
        $before = ($at > 0) ? $foldedTitle[$at - 1] : ' ';
        $after = ($stop < $titleLength) ? $foldedTitle[$stop] : ' ';

        if (ctype_alnum($before)) {
            continue;
        }
        if (!$isMixed && ctype_alnum($after)) {
            continue;
        }

        $matches[] = [$at, $stop];
    }

    return $matches;
}
|
|
|
|
/**
 * Reward titles in which the tokens appear close together.
 *
 * All word-boundary occurrences of the tokens are collected and a sliding
 * window finds the shortest character span containing every token that is
 * present in the title at least once.
 *
 * @param string $title  Post title (folded internally).
 * @param array  $tokens Search tokens.
 * @return float W_PROX_CHARS * (distinct tokens present / shortest span);
 *               0.0 when fewer than two distinct tokens are given or present.
 */
private static function proximityBoost(string $title, array $tokens): float
{
    $wanted = array_values(array_unique(array_filter($tokens, fn($t) => $t !== '')));
    if (count($wanted) < 2) {
        return 0.0;
    }

    $folded = self::asciiFold($title);

    // Gather every occurrence and remember which tokens occur at all.
    $hits = [];
    $tokensPresent = [];
    foreach ($wanted as $tok) {
        foreach (self::findPositions($folded, $tok) as [$from, $to]) {
            $hits[] = ['pos' => $from, 'end' => $to, 'tok' => $tok];
            $tokensPresent[$tok] = true;
        }
    }

    $needed = count($tokensPresent);
    if ($needed < 2) {
        return 0.0;
    }

    usort($hits, fn($a, $b) => $a['pos'] <=> $b['pos']);

    // Minimum-window scan: grow on the right, shrink on the left while complete.
    $inWindow = [];
    $distinct = 0;
    $shortest = PHP_INT_MAX;
    $left = 0;
    foreach ($hits as $right => $hit) {
        $entering = $hit['tok'];
        $inWindow[$entering] = ($inWindow[$entering] ?? 0) + 1;
        if ($inWindow[$entering] === 1) {
            $distinct++;
        }

        while ($distinct === $needed && $left <= $right) {
            $shortest = min($shortest, $hit['end'] - $hits[$left]['pos']);

            $leaving = $hits[$left]['tok'];
            if (--$inWindow[$leaving] === 0) {
                $distinct--;
            }
            $left++;
        }
    }

    if ($shortest === PHP_INT_MAX) {
        return 0.0;
    }

    return self::W_PROX_CHARS * ($needed / max(1, $shortest));
}
|
|
|
|
/**
 * Reward titles that contain all tokens in query order within a tight window.
 *
 * For each occurrence of the first token, the following tokens are matched
 * greedily, each to its first occurrence at or after the end of the previous
 * match. The shortest resulting span wins (the earliest one on ties).
 *
 * @param string $title  Post title (folded internally).
 * @param array  $tokens Search tokens in query order.
 * @return float W_ORDERED_WINDOW * (token count / words in the best span), plus
 *               W_ORDERED_ANCHOR when that span starts within the first 7 bytes;
 *               0.0 when fewer than two tokens are given, a token is missing,
 *               or no in-order chain exists.
 */
private static function orderedWindowBoost(string $title, array $tokens): float
{
    $wanted = array_values(array_unique(array_filter($tokens, fn($t) => $t !== '')));
    $wantedCount = count($wanted);
    if ($wantedCount < 2) {
        return 0.0;
    }

    $folded = self::asciiFold($title);

    // Every token must occur at least once.
    $positions = [];
    foreach ($wanted as $tok) {
        $occurrences = self::findPositions($folded, $tok);
        if ($occurrences === []) {
            return 0.0;
        }
        $positions[$tok] = $occurrences;
    }

    $shortestSpan = PHP_INT_MAX;
    $spanStart = -1;
    $followers = array_slice($wanted, 1);

    foreach ($positions[$wanted[0]] as [$start, $firstEnd]) {
        $end = $firstEnd;
        $cursor = $firstEnd;
        $complete = true;

        foreach ($followers as $tok) {
            // First occurrence of this token that begins at or after the cursor.
            $next = null;
            foreach ($positions[$tok] as $candidate) {
                if ($candidate[0] >= $cursor) {
                    $next = $candidate;
                    break;
                }
            }
            if ($next === null) {
                $complete = false;
                break;
            }
            $end = max($end, $next[1]);
            $cursor = $next[1];
        }

        if ($complete && ($end - $start) < $shortestSpan) {
            $shortestSpan = $end - $start;
            $spanStart = $start;
        }
    }

    if ($shortestSpan === PHP_INT_MAX) {
        return 0.0;
    }

    // Tightness = tokens per word inside the winning span.
    // array_filter() without a callback also drops the word "0".
    $window = substr($folded, max(0, $spanStart), max(1, $shortestSpan));
    $wordsInWindow = max(1, count(array_filter(explode(' ', $window))));
    $score = self::W_ORDERED_WINDOW * ($wantedCount / $wordsInWindow);

    return ($spanStart <= 6) ? $score + self::W_ORDERED_ANCHOR : $score;
}
|
|
|
|
/**
 * Penalize very long titles.
 *
 * @param string $title Post title (length measured in UTF-8 characters).
 * @return float 0.0 up to LEN_PEN_START characters; beyond that, minus
 *               LEN_PEN_PER_CHAR per extra character, capped at -300.0.
 */
private static function lengthPenalty(string $title): float
{
    $excess = mb_strlen($title, 'UTF-8') - self::LEN_PEN_START;
    if ($excess <= 0) {
        return 0.0;
    }

    return -min(300.0, $excess * self::LEN_PEN_PER_CHAR);
}
|
|
|
|
// ==================== URL Helpers ====================
|
|
|
|
/**
 * Read the site URL from the `home` row of the options table.
 *
 * The value is memoized in a function-level static variable, so it is
 * queried at most once per request and shared by every instance.
 *
 * @return string URL without trailing slashes, or '' when the row is missing.
 */
private function getSiteUrlFromDb(): string
{
    static $memo = null;

    if ($memo === null) {
        $query = $this->pdo->prepare(
            "SELECT option_value FROM {$this->prefix}options
            WHERE option_name = 'home' LIMIT 1"
        );
        $query->execute();
        $row = $query->fetch(\PDO::FETCH_ASSOC);

        $memo = $row ? rtrim($row['option_value'], '/') : '';
    }

    return $memo;
}
|
|
|
|
/**
 * Read the permalink structure from the `permalink_structure` row of the options table.
 *
 * The value is memoized in a function-level static variable, so it is
 * queried at most once per request and shared by every instance.
 *
 * @return string Stored structure, or '' when the row is missing.
 */
private function getPermalinkStructure(): string
{
    static $memo = null;

    if ($memo === null) {
        $query = $this->pdo->prepare(
            "SELECT option_value FROM {$this->prefix}options
            WHERE option_name = 'permalink_structure' LIMIT 1"
        );
        $query->execute();
        $row = $query->fetch(\PDO::FETCH_ASSOC);

        $memo = $row ? $row['option_value'] : '';
    }

    return $memo;
}
|
|
|
|
/**
 * Build a post URL from its ID and slug, honouring the stored permalink structure.
 *
 * Resolution order:
 * 1. Empty slug or empty structure (plain permalinks): "<site>/?p=<ID>".
 * 2. Structure made only of literal text plus %postname% / %post_id%:
 *    the tags are substituted and the literal parts are kept
 *    (e.g. "/blog/%postname%/" becomes "<site>/blog/<slug>/").
 * 3. Structure with tags that cannot be resolved from the arguments
 *    (dates, category, ...): "<site>/<ID>/" when it contains %post_id%,
 *    otherwise "<site>/<slug>/".
 *
 * @param int    $postId   Post ID.
 * @param string $postName Post slug (post_name column).
 * @return string Absolute URL.
 */
public function buildPermalink(int $postId, string $postName): string
{
    $siteUrl = $this->getSiteUrlFromDb();

    // Strict comparison: the slug "0" is a valid post_name and must not be treated as empty.
    if ($postName === '') {
        return $siteUrl . '/?p=' . $postId;
    }

    $structure = $this->getPermalinkStructure();
    if ($structure === '') {
        // Plain permalinks: there are no pretty URLs to build.
        return $siteUrl . '/?p=' . $postId;
    }

    $replacements = [
        '%postname%' => $postName,
        '%post_id%' => (string) $postId,
    ];

    // Look for leftover tags on the structure itself, not on the substituted
    // path: slugs may legitimately contain percent-encoded bytes.
    $residue = str_replace(array_keys($replacements), '', $structure);
    $hasKnownTag = $residue !== $structure;
    if ($hasKnownTag && !str_contains($residue, '%')) {
        return $siteUrl . '/' . ltrim(strtr($structure, $replacements), '/');
    }

    // Structures with tags that cannot be resolved here.
    if (str_contains($structure, '%post_id%')) {
        return $siteUrl . '/' . $postId . '/';
    }

    return $siteUrl . '/' . $postName . '/';
}
|
|
}
|