Files
roi-theme/buscar-apus/app/Search.php
root a22573bf0b Commit inicial - WordPress Análisis de Precios Unitarios
- WordPress core y plugins
- Tema Twenty Twenty-Four configurado
- Plugin allow-unfiltered-html.php simplificado
- .gitignore configurado para excluir wp-config.php y uploads

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-03 21:04:30 -06:00

526 lines
19 KiB
PHP
Executable File

<?php
// public_html/buscar-apus/app/Search.php
/**
 * Hybrid title search over the WordPress posts table.
 *
 * Candidates are collected from several SQL "buckets" (all-tokens LIKE,
 * FULLTEXT boolean, prefix LIKE, substring LIKE, exact equality), merged and
 * de-duplicated by normalized title, then re-ranked in PHP with a sum of
 * heuristic boosts and penalties (see the weight constants below).
 *
 * Only published rows with post_type='post' are searched, and only post_title
 * is matched.
 */
final class Search {
    private PDO $pdo;
    private string $table;
    private bool $logEnabled;

    /* ===== Tunable weights ===== */
    private const RAW_REL_MULT      = 40.0;   // FULLTEXT relevance signal (soft)
    private const W_COVERAGE        = 200.0;  // share of query tokens found in the title
    private const W_STARTSWITH      = 240.0;  // title starts with the term
    private const W_WORD_EXACT      = 140.0;  // term appears as a whole word/phrase
    private const W_LEVENSHTEIN     = 160.0;  // whole-string similarity title vs. term
    private const W_FUZZY_TOKEN_MAX = 120.0;  // best per-token similarity
    private const W_RECENCY_MAX     = 120.0;  // recency (approximate maximum)
    private const W_PROX_CHARS      = 620.0;  // proximity measured as a character span
    private const W_ORDERED_WINDOW  = 1600.0; // **main signal**: tokens in order and close together
    private const W_ORDERED_ANCHOR  = 300.0;  // bonus when the ordered window starts near the beginning
    private const LEN_PEN_START     = 180;    // title length from which the penalty applies
    private const LEN_PEN_PER_CHAR  = 0.55;
    // Penalty when tokens are missing in short queries (<= 4 tokens).
    private const REQ_MISS_PER_TOKEN = 420.0; // per missing token
    private const REQ_BASE_PENALTY   = 140.0; // flat penalty when at least one is missing

    /**
     * Deterministic accent folding applied before iconv, so the result does not
     * depend on the platform's iconv implementation or the process locale.
     */
    private const ACCENT_MAP = [
        'á' => 'a', 'à' => 'a', 'ä' => 'a', 'â' => 'a', 'ã' => 'a', 'å' => 'a',
        'é' => 'e', 'è' => 'e', 'ë' => 'e', 'ê' => 'e',
        'í' => 'i', 'ì' => 'i', 'ï' => 'i', 'î' => 'i',
        'ó' => 'o', 'ò' => 'o', 'ö' => 'o', 'ô' => 'o', 'õ' => 'o',
        'ú' => 'u', 'ù' => 'u', 'ü' => 'u', 'û' => 'u',
        'ñ' => 'n', 'ç' => 'c',
    ];

    /**
     * @param PDO    $pdo        Open database connection.
     * @param string $prefix     Table prefix; "{$prefix}posts" is interpolated into the SQL,
     *                           so it must come from trusted configuration, never from user input.
     * @param bool   $logEnabled Forwarded as the first argument of every Logger::log() call.
     */
    public function __construct(PDO $pdo, string $prefix, bool $logEnabled) {
        $this->pdo = $pdo;
        $this->table = "{$prefix}posts";
        $this->logEnabled = $logEnabled;
    }

    /**
     * Validates the raw search term length.
     *
     * @return array{0: bool, 1: string, 2: ?string} [ok, term, error message or null].
     *         When the term is too long, the returned term is truncated to $max characters.
     */
    public static function sanitizeTerm(string $raw, int $min, int $max): array {
        $term = trim($raw);
        if ($term === '') {
            return [false, $term, 'Ingresa un término'];
        }
        $length = mb_strlen($term, 'UTF-8');
        if ($length < $min) {
            return [false, $term, "Mínimo {$min} caracteres"];
        }
        if ($length > $max) {
            return [false, mb_substr($term, 0, $max, 'UTF-8'), "Máximo {$max} caracteres"];
        }
        return [true, $term, null];
    }

    /* ===== Text / token / normalization utilities ===== */

    /**
     * Lower-cases, strips accents, replaces every run of non-alphanumeric
     * characters with a single space and trims the result.
     */
    private static function asciiFold(string $s): string {
        $s = mb_strtolower($s, 'UTF-8');
        // Fold common accented letters first: iconv //TRANSLIT output varies by
        // platform/locale and may yield "?" or quote marks, which would split words.
        $s = strtr($s, self::ACCENT_MAP);
        // Best effort for anything the map does not cover; "@" keeps the
        // "illegal character" notice quiet and the previous value is kept on failure.
        $x = @iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
        if ($x !== false) {
            $s = $x;
        }
        // preg_replace() returns null on error; fall back to '' to honour the return type.
        $s = preg_replace('/[^a-z0-9 ]+/i', ' ', $s) ?? '';
        return preg_replace('/\s+/', ' ', trim($s)) ?? '';
    }

    /** Folded title truncated to 160 bytes; used as the de-duplication key. */
    private static function normTitle(string $title): string {
        return substr(self::asciiFold($title), 0, 160);
    }

    /**
     * Splits a term into unique folded tokens, preserving first-seen order. Keeps:
     *  - tokens of 2+ characters,
     *  - purely numeric tokens (always),
     *  - 1-character tokens on a whitelist, e.g. "f" (relevant for f'c).
     *
     * @return list<string>
     */
    private static function tokens(string $term): array {
        $keepSingle = ['f', 'x']; // extend if needed (e.g. 'h' for sections)
        $unique = [];
        foreach (explode(' ', self::asciiFold($term)) as $word) {
            if ($word === '') {
                continue;
            }
            if (ctype_digit($word) || strlen($word) >= 2 || in_array($word, $keepSingle, true)) {
                $unique[$word] = true; // array keys give a stable-order de-dup
            }
        }
        // Numeric-looking keys come back as ints; callers expect strings.
        return array_map(static fn($k): string => (string)$k, array_keys($unique));
    }

    /**
     * Builds a MySQL BOOLEAN MODE expression (title only).
     *
     * Every unquoted word becomes a required prefix match ("+word*"); a word the
     * user prefixed with "-" becomes an exclusion ("-word*"); double-quoted
     * phrases become required phrases. Stray operator characters are dropped so
     * the result never contains stacked operators such as "+-foo*" or "+bar**".
     *
     * NOTE(review): AND / OR / NOT are passed through upper-cased as bare words,
     * as before. MySQL boolean mode does not appear to define them as operators —
     * confirm whether this is intended.
     */
    private static function booleanQuery(string $input): string {
        $input = preg_replace("/[^\\p{L}\\p{N}\\s\"'\\+\\-\\*]/u", ' ', trim($input));
        if ($input === null || $input === '') {
            return ''; // invalid UTF-8 or nothing left to search for
        }

        // Pass 1: split on whitespace, keeping double-quoted phrases together.
        $rawTokens = [];
        $buf = '';
        $inQuotes = false;
        foreach (mb_str_split($input, 1, 'UTF-8') as $ch) {
            if ($ch === '"') {
                if ($inQuotes) {
                    $buf .= $ch;
                    if ($buf !== '""') {
                        $rawTokens[] = $buf;
                    }
                    $buf = '';
                    $inQuotes = false;
                } else {
                    if ($buf !== '') {
                        $rawTokens[] = $buf;
                    }
                    $buf = '"';
                    $inQuotes = true;
                }
            } elseif (preg_match('/\s/u', $ch)) {
                if ($inQuotes) {
                    $buf .= $ch;
                } elseif ($buf !== '') {
                    $rawTokens[] = $buf;
                    $buf = '';
                }
            } else {
                $buf .= $ch;
            }
        }
        if ($buf !== '') {
            $rawTokens[] = $inQuotes ? ($buf . '"') : $buf; // auto-close an open phrase
        }

        // Pass 2: turn each raw token into a well-formed boolean-mode term.
        $parts = [];
        foreach ($rawTokens as $tok) {
            if (mb_strlen($tok, 'UTF-8') < 2) {
                continue;
            }
            if ($tok[0] === '"' && substr($tok, -1) === '"') {
                // Skip phrases that contain no letter or digit at all.
                if (preg_match('/[\p{L}\p{N}]/u', $tok) === 1) {
                    $parts[] = '+' . $tok;
                }
                continue;
            }
            $upper = strtoupper($tok);
            if (in_array($upper, ['AND', 'OR', 'NOT'], true)) {
                $parts[] = $upper;
                continue;
            }
            // Honour a single leading "-" as exclusion; everything else is required.
            $sign = ($tok[0] === '-') ? '-' : '+';
            // Break on embedded operators so "pre-fab" yields two clean words
            // instead of an accidental "-fab" exclusion.
            $words = preg_split('/[+\-*]+/', $tok, -1, PREG_SPLIT_NO_EMPTY) ?: [];
            foreach ($words as $word) {
                $word = trim($word, "'");
                if (mb_strlen($word, 'UTF-8') < 2) {
                    continue;
                }
                $parts[] = $sign . $word . '*';
            }
        }
        return implode(' ', $parts);
    }

    /* ===== Signals / boosts ===== */

    /** Number of non-empty tokens that occur as substrings of the folded title. */
    private static function countTokenHits(string $title, array $tokens): int {
        $folded = self::asciiFold($title);
        $hits = 0;
        foreach ($tokens as $tok) {
            if ($tok !== '' && str_contains($folded, (string)$tok)) {
                $hits++;
            }
        }
        return $hits;
    }

    /** Fraction of query tokens present in the title, scaled by W_COVERAGE. */
    private static function coverageBoost(string $title, array $tokens): float {
        if (!$tokens) {
            return 0.0;
        }
        return (self::countTokenHits($title, $tokens) / max(1, count($tokens))) * self::W_COVERAGE;
    }

    /**
     * Penalizes missing tokens when the query is short (<= 4 tokens).
     * E.g. "concreto f 250": a title lacking one of them is punished heavily.
     * Returns 0.0 or a negative number.
     */
    private static function requiredTokensPenalty(string $title, array $tokens): float {
        $n = count($tokens);
        if ($n === 0 || $n > 4) {
            return 0.0;
        }
        $missing = $n - self::countTokenHits($title, $tokens);
        if ($missing <= 0) {
            return 0.0;
        }
        return -(self::REQ_BASE_PENALTY + self::REQ_MISS_PER_TOKEN * $missing);
    }

    /** W_STARTSWITH when the folded title begins with the folded term. */
    private static function startsWithBoost(string $title, string $term): float {
        $needle = self::asciiFold($term);
        if ($needle === '') {
            return 0.0; // an empty prefix would match every title
        }
        return str_starts_with(self::asciiFold($title), $needle) ? self::W_STARTSWITH : 0.0;
    }

    /** W_WORD_EXACT when the folded term occurs in the title delimited by word boundaries. */
    private static function wordExactBoost(string $title, string $term): float {
        $needle = self::asciiFold($term);
        if ($needle === '') {
            return 0.0;
        }
        $haystack = ' ' . self::asciiFold($title) . ' ';
        return preg_match('/\b' . preg_quote($needle, '/') . '\b/u', $haystack) ? self::W_WORD_EXACT : 0.0;
    }

    /** Decays from roughly W_RECENCY_MAX with post age; the divisor grows by 1 every 180 days. */
    private static function recencyBoost(string $date): float {
        $d = strtotime($date);
        if (!$d) {
            return 0.0;
        }
        $days = max(1, (time() - $d) / 86400); // future dates count as one day old
        return self::W_RECENCY_MAX / (1.0 + $days / 180.0);
    }

    /** Similarity in [0, 1] between the first 80 folded bytes of both strings. */
    private static function levenshteinSimilarity(string $a, string $b): float {
        $aa = substr(self::asciiFold($a), 0, 80);
        $bb = substr(self::asciiFold($b), 0, 80);
        if ($aa === '' || $bb === '') {
            return 0.0;
        }
        $longest = max(strlen($aa), strlen($bb));
        return max(0.0, 1.0 - (levenshtein($aa, $bb) / $longest));
    }

    /**
     * Best Levenshtein similarity between any query token and any of the first
     * 12 words of the title, scaled by W_FUZZY_TOKEN_MAX.
     */
    private static function tokenFuzzyBoost(string $title, array $tokens): float {
        if (!$tokens) {
            return 0.0;
        }
        $titleWords = array_slice(preg_split('/\s+/', self::asciiFold($title)) ?: [], 0, 12);
        if (!$titleWords) {
            return 0.0;
        }
        $best = 0.0;
        foreach ($tokens as $tok) {
            $folded = self::asciiFold((string)$tok);
            if ($folded === '') {
                continue;
            }
            foreach ($titleWords as $word) {
                if ($word === '') {
                    continue;
                }
                $longest = max(strlen($folded), strlen($word));
                $sim = 1.0 - (levenshtein($folded, $word) / $longest);
                if ($sim > $best) {
                    $best = $sim;
                }
            }
        }
        return max(0.0, $best) * self::W_FUZZY_TOKEN_MAX;
    }

    /**
     * Finds every occurrence of $token in an already-folded title, with edge control:
     *  - the character to the left must never be alphanumeric;
     *  - the character to the right must not be alphanumeric either, EXCEPT for
     *    mixed tokens (letters + digits, e.g. "15x15"), where the right edge is
     *    relaxed so suffixes such as "15x15cm" still match.
     *
     * @return list<array{0: int, 1: int}> [start, end) byte offsets.
     */
    private static function findPositions(string $foldedTitle, string $token): array {
        $found = [];
        $titleLen = strlen($foldedTitle);
        $tokenLen = strlen($token);
        if ($tokenLen === 0) {
            return $found;
        }
        $isMixed = preg_match('/[a-z]/', $token) === 1 && preg_match('/[0-9]/', $token) === 1;

        $from = 0;
        while (($p = strpos($foldedTitle, $token, $from)) !== false) {
            $end = $p + $tokenLen;
            $left = ($p > 0) ? $foldedTitle[$p - 1] : ' ';
            $right = ($end < $titleLen) ? $foldedTitle[$end] : ' ';
            if (!ctype_alnum($left) && ($isMixed || !ctype_alnum($right))) {
                $found[] = [$p, $end];
            }
            $from = $p + 1;
        }
        return $found;
    }

    /**
     * Proximity by character span: finds the shortest stretch of the title that
     * contains every query token that is present in it at all, and rewards
     * compactness (tokens covered / span length). Order does not matter.
     */
    private static function proximityBoost(string $title, array $tokens): float {
        $tokens = array_values(array_unique(array_filter($tokens, fn($t) => $t !== '')));
        if (count($tokens) < 2) {
            return 0.0;
        }
        $folded = self::asciiFold($title);
        $occ = [];
        foreach ($tokens as $tok) {
            foreach (self::findPositions($folded, (string)$tok) as [$start, $end]) {
                $occ[] = ['pos' => $start, 'end' => $end, 'tok' => $tok];
            }
        }
        if (!$occ) {
            return 0.0;
        }
        usort($occ, fn($a, $b) => $a['pos'] <=> $b['pos']);

        $needCount = count(array_unique(array_column($occ, 'tok')));
        if ($needCount < 2) {
            return 0.0;
        }

        // Sliding window over the sorted occurrences: grow on the right, then
        // shrink from the left while the window still covers every present token.
        $inWindow = [];
        $covered = 0;
        $bestSpan = PHP_INT_MAX;
        $total = count($occ);
        for ($r = 0, $l = 0; $r < $total; $r++) {
            $t = $occ[$r]['tok'];
            $inWindow[$t] = ($inWindow[$t] ?? 0) + 1;
            if ($inWindow[$t] === 1) {
                $covered++;
            }
            while ($covered === $needCount && $l <= $r) {
                $bestSpan = min($bestSpan, $occ[$r]['end'] - $occ[$l]['pos']);
                $lt = $occ[$l]['tok'];
                if (--$inWindow[$lt] === 0) {
                    $covered--;
                }
                $l++;
            }
        }
        if ($bestSpan === PHP_INT_MAX) {
            return 0.0;
        }
        return self::W_PROX_CHARS * ($needCount / max(1, $bestSpan));
    }

    /**
     * ORDERED window (highest-priority signal): all tokens must appear in query
     * order without overlapping. The score is W_ORDERED_WINDOW times the ratio
     * tokens / words-inside-the-window, plus W_ORDERED_ANCHOR when the window
     * starts within the first 6 characters of the title.
     */
    private static function orderedWindowBoost(string $title, array $tokens): float {
        $tokens = array_values(array_unique(array_filter($tokens, fn($t) => $t !== '')));
        $tokenCount = count($tokens);
        if ($tokenCount < 2) {
            return 0.0;
        }
        $folded = self::asciiFold($title);
        $posList = [];
        foreach ($tokens as $t) {
            $posList[$t] = self::findPositions($folded, (string)$t);
            if (!$posList[$t]) {
                return 0.0; // a missing token means no ordered window exists
            }
        }

        $bestSpan = PHP_INT_MAX;
        $bestStart = -1;
        foreach ($posList[$tokens[0]] as [$start, $end]) {
            $cursor = $end;
            $complete = true;
            for ($i = 1; $i < $tokenCount; $i++) {
                $next = null;
                // Greedy: first occurrence of the next token at or after the cursor.
                foreach ($posList[$tokens[$i]] as $candidate) {
                    if ($candidate[0] >= $cursor) {
                        $next = $candidate;
                        break;
                    }
                }
                if ($next === null) {
                    $complete = false;
                    break;
                }
                $end = max($end, $next[1]);
                $cursor = $next[1];
            }
            if ($complete && ($end - $start) < $bestSpan) {
                $bestSpan = $end - $start;
                $bestStart = $start;
            }
        }
        if ($bestSpan === PHP_INT_MAX) {
            return 0.0;
        }

        $slice = substr($folded, max(0, $bestStart), max(1, $bestSpan));
        // Explicit callback: a bare array_filter() would also drop the word "0".
        $words = array_filter(explode(' ', $slice), static fn(string $w): bool => $w !== '');
        $tightness = $tokenCount / max(1, count($words));

        $score = self::W_ORDERED_WINDOW * $tightness;
        if ($bestStart <= 6) {
            $score += self::W_ORDERED_ANCHOR;
        }
        return $score;
    }

    /** Penalty for overly long titles; returns 0.0 or a negative number, capped at -300. */
    private static function lengthPenalty(string $title): float {
        $extra = mb_strlen($title, 'UTF-8') - self::LEN_PEN_START;
        if ($extra <= 0) {
            return 0.0;
        }
        return -min(300.0, $extra * self::LEN_PEN_PER_CHAR);
    }

    /* ===== Candidate buckets (title only) ===== */

    /** Escapes LIKE wildcards so the value matches literally under ESCAPE '\'. */
    private static function escapeLike(string $value): string {
        // Backslash is replaced first so the escapes added for % and _ are not doubled.
        return str_replace(['\\', '%', '_'], ['\\\\', '\%', '\_'], $value);
    }

    /**
     * Prepares $sql, binds $params plus the integer :lim placeholder, executes
     * it, logs the row count under $event and returns the rows keyed by column name.
     *
     * @param array<string, string> $params Placeholder name => value.
     * @param array<string, mixed>  $extra  Additional fields for the log entry.
     */
    private function fetchRows(string $sql, array $params, int $limit, string $event, array $extra = []): array {
        $st = $this->pdo->prepare($sql);
        foreach ($params as $name => $value) {
            $st->bindValue($name, $value);
        }
        $st->bindValue(':lim', $limit, PDO::PARAM_INT);
        $st->execute();
        // Explicit mode: callers read columns by name regardless of the connection default.
        $rows = $st->fetchAll(PDO::FETCH_ASSOC);
        Logger::log($this->logEnabled, $event, $extra + ['count' => count($rows)]);
        return $rows;
    }

    /** Titles equal to the term under utf8mb4_general_ci, newest first. */
    private function fetchEquals(string $term, int $limit): array {
        $sql = "SELECT ID, post_title, post_date
                FROM {$this->table}
                WHERE post_type='post' AND post_status='publish'
                  AND post_title COLLATE utf8mb4_general_ci = :t
                ORDER BY post_date DESC
                LIMIT :lim";
        return $this->fetchRows($sql, [':t' => $term], $limit, 'bucket_equals');
    }

    /** Titles that start with the term, newest first. */
    private function fetchStartsWith(string $term, int $limit): array {
        $sql = "SELECT ID, post_title, post_date
                FROM {$this->table}
                WHERE post_type='post' AND post_status='publish'
                  AND post_title LIKE :p ESCAPE '\\\\'
                ORDER BY post_date DESC
                LIMIT :lim";
        return $this->fetchRows($sql, [':p' => self::escapeLike($term) . '%'], $limit, 'bucket_starts');
    }

    /** FULLTEXT boolean-mode matches, ordered by relevance then date; rows carry raw_rel. */
    private function fetchFulltextTitle(string $term, int $limit): array {
        $q = self::booleanQuery($term);
        if ($q === '') {
            return [];
        }
        // Two distinct placeholders: a named marker cannot be reused within one
        // statement when PDO uses native (non-emulated) prepares.
        $sql = "SELECT ID, post_title, post_date,
                       MATCH(post_title) AGAINST (:q_sel IN BOOLEAN MODE) AS raw_rel
                FROM {$this->table}
                WHERE post_type='post' AND post_status='publish'
                  AND MATCH(post_title) AGAINST (:q_where IN BOOLEAN MODE)
                ORDER BY raw_rel DESC, post_date DESC
                LIMIT :lim";
        return $this->fetchRows($sql, [':q_sel' => $q, ':q_where' => $q], $limit, 'bucket_fulltext', ['q' => $q]);
    }

    /** Titles containing ALL tokens (one LIKE per token): raises recall of good candidates. */
    private function fetchAllTokensLike(array $tokens, int $limit): array {
        if (!$tokens) {
            return [];
        }
        $conditions = [];
        $params = [];
        foreach ($tokens as $i => $tok) {
            $conditions[] = "post_title LIKE :lk{$i} ESCAPE '\\\\'";
            $params[":lk{$i}"] = '%' . self::escapeLike((string)$tok) . '%';
        }
        $where = implode(' AND ', $conditions);
        $sql = "SELECT ID, post_title, post_date
                FROM {$this->table}
                WHERE post_type='post' AND post_status='publish' AND {$where}
                ORDER BY post_date DESC
                LIMIT :lim";
        return $this->fetchRows($sql, $params, $limit, 'bucket_like_all');
    }

    /** Titles containing the whole term as a substring, newest first. */
    private function fetchContains(string $term, int $limit): array {
        $sql = "SELECT ID, post_title, post_date
                FROM {$this->table}
                WHERE post_type='post' AND post_status='publish'
                  AND post_title LIKE :l ESCAPE '\\\\'
                ORDER BY post_date DESC
                LIMIT :lim";
        return $this->fetchRows($sql, [':l' => '%' . self::escapeLike($term) . '%'], $limit, 'bucket_contains');
    }

    /** Total re-rank score for one pooled candidate (sum of base weight, boosts and penalties). */
    private static function scoreCandidate(array $item, string $term, array $tokens): float {
        $title = $item['post_title'];
        return $item['baseW']
            + ($item['raw_rel'] * self::RAW_REL_MULT)
            + self::coverageBoost($title, $tokens)
            + self::orderedWindowBoost($title, $tokens)   // dominant signal
            + self::proximityBoost($title, $tokens)       // close together, not necessarily ordered
            + self::startsWithBoost($title, $term)
            + self::wordExactBoost($title, $term)
            + (self::levenshteinSimilarity($title, $term) * self::W_LEVENSHTEIN)
            + self::tokenFuzzyBoost($title, $tokens)
            + self::recencyBoost($item['post_date'])
            + self::lengthPenalty($title)
            + self::requiredTokensPenalty($title, $tokens);
    }

    /**
     * Runs the hybrid search: fetch buckets, merge + de-duplicate, re-rank, paginate.
     *
     * @param string $term   Search term (validate with sanitizeTerm() first).
     * @param int    $limit  Page size; negative values are treated as 0.
     * @param int    $offset Rows to skip in the ranked pool; negative values are treated as 0.
     * @return array{total: int, rows: list<array{ID: int, post_title: string, post_date: string}>, modo: string, time_ms: float}
     *         "total" is the size of the de-duplicated candidate pool (bounded by
     *         the per-bucket caps), not a count of every matching row in the table.
     */
    public function run(string $term, int $limit, int $offset): array {
        $t0 = microtime(true);
        $limit = max(0, $limit);
        $offset = max(0, $offset);
        $tokens = self::tokens($term);

        // Pool sizes per bucket.
        $capFull = max(120, min(300, $limit * 8));
        $capLike = max(120, min(300, $limit * 8));
        $capPref = max(60, min(200, $limit * 4));
        $capEq   = min(40, $limit * 2);
        $capCont = max(80, min(240, $limit * 6));

        $buckets = [
            ['name' => 'LIKE_ALL', 'base' => 900.0,  'rows' => $this->fetchAllTokensLike($tokens, $capLike)],
            ['name' => 'FULLTEXT', 'base' => 700.0,  'rows' => $this->fetchFulltextTitle($term, $capFull)],
            ['name' => 'STARTS',   'base' => 650.0,  'rows' => $this->fetchStartsWith($term, $capPref)],
            ['name' => 'CONTAINS', 'base' => 500.0,  'rows' => $this->fetchContains($term, $capCont)],
            ['name' => 'EQUALS',   'base' => 1200.0, 'rows' => $this->fetchEquals($term, $capEq)],
        ];

        // De-duplicate by normalized title. The first row seen provides ID, title
        // and date; later duplicates only upgrade the base weight / FULLTEXT
        // relevance, so a title also found by a stronger bucket (e.g. EQUALS)
        // keeps that bucket's weight instead of silently losing it.
        $indexByTitle = [];
        $pool = [];
        foreach ($buckets as $bucket) {
            $base = (float)$bucket['base'];
            foreach ($bucket['rows'] as $row) {
                $key = self::normTitle((string)$row['post_title']);
                $rawRel = isset($row['raw_rel']) ? (float)$row['raw_rel'] : 0.0;
                if (isset($indexByTitle[$key])) {
                    $k = $indexByTitle[$key];
                    if ($base > $pool[$k]['baseW']) {
                        $pool[$k]['baseW'] = $base;
                        $pool[$k]['bucket'] = $bucket['name'];
                    }
                    if ($rawRel > $pool[$k]['raw_rel']) {
                        $pool[$k]['raw_rel'] = $rawRel;
                    }
                    continue;
                }
                $indexByTitle[$key] = count($pool);
                $pool[] = [
                    'ID'         => (int)$row['ID'],
                    'post_title' => (string)$row['post_title'],
                    'post_date'  => (string)$row['post_date'],
                    'bucket'     => $bucket['name'],
                    'baseW'      => $base,
                    'raw_rel'    => $rawRel,
                ];
            }
        }

        $poolTotal = count($pool);
        if ($poolTotal === 0) {
            $elapsed = round((microtime(true) - $t0) * 1000, 2);
            Logger::log($this->logEnabled, 'consulta_ejecutada', [
                'modo' => 'HYBRID', 'total' => 0, 'filas' => 0, 'tiempo_ms' => $elapsed, 'pool_total' => 0,
            ]);
            return ['total' => 0, 'rows' => [], 'modo' => 'HYBRID', 'time_ms' => $elapsed];
        }

        // Re-rank.
        foreach ($pool as $k => $item) {
            $pool[$k]['score'] = self::scoreCandidate($item, $term, $tokens);
        }
        // Highest score first; ties broken by most recent post_date.
        usort(
            $pool,
            static fn(array $a, array $b): int =>
                ($b['score'] <=> $a['score']) ?: strcmp($b['post_date'], $a['post_date'])
        );

        // Pagination.
        $rows = array_map(
            static fn(array $r): array => [
                'ID'         => $r['ID'],
                'post_title' => $r['post_title'],
                'post_date'  => $r['post_date'],
                // for debugging: 'score' => $r['score'], 'bucket' => $r['bucket']
            ],
            array_slice($pool, $offset, $limit)
        );

        $elapsed = round((microtime(true) - $t0) * 1000, 2);
        Logger::log($this->logEnabled, 'consulta_ejecutada', [
            'modo'       => 'HYBRID',
            'total'      => $poolTotal,
            'filas'      => count($rows),
            'tiempo_ms'  => $elapsed,
            'pool_total' => $poolTotal,
            'buckets'    => [
                'like_all' => count($buckets[0]['rows']),
                'fulltext' => count($buckets[1]['rows']),
                'starts'   => count($buckets[2]['rows']),
                'contains' => count($buckets[3]['rows']),
                'equals'   => count($buckets[4]['rows']),
            ],
        ]);
        return ['total' => $poolTotal, 'rows' => $rows, 'modo' => 'HYBRID', 'time_ms' => $elapsed];
    }
}