pdo = $pdo;
        $this->table = "{$prefix}posts";
        $this->logEnabled = $logEnabled;
    }

    /** --- Term sanitization --- */

    /**
     * Validates a raw search term against length bounds.
     *
     * The term is trimmed first; lengths are measured in UTF-8 characters.
     *
     * @param string $raw Raw user input.
     * @param int    $min Minimum accepted length, in characters.
     * @param int    $max Maximum accepted length, in characters.
     *
     * @return array{0: bool, 1: string, 2: ?string} [ok, term, error message].
     *         On the "too long" failure the returned term is truncated to $max characters.
     */
    public static function sanitizeTerm(string $raw, int $min, int $max): array
    {
        $term = trim($raw);
        if ($term === '') {
            return [false, $term, 'Ingresa un término'];
        }

        $length = mb_strlen($term, 'UTF-8');
        if ($length < $min) {
            return [false, $term, "Mínimo {$min} caracteres"];
        }
        if ($length > $max) {
            return [false, mb_substr($term, 0, $max, 'UTF-8'), "Máximo {$max} caracteres"];
        }

        return [true, $term, null];
    }

    /** --- Text / token / normalization utilities --- */

    /**
     * Folds a string to lower-case ASCII letters, digits and single spaces.
     *
     * Every run of characters outside [a-z0-9 ] becomes one space, and the
     * result is trimmed.
     *
     * @param string $s Arbitrary UTF-8 text.
     *
     * @return string Folded text (possibly empty).
     */
    private static function asciiFold(string $s): string
    {
        $s = mb_strtolower($s, 'UTF-8');

        // Best effort: when iconv cannot convert the input it returns false
        // (and warns); in that case the lower-cased original is kept.
        $ascii = @iconv('UTF-8', 'ASCII//TRANSLIT//IGNORE', $s);
        if ($ascii !== false) {
            // Transliteration may introduce upper-case ASCII; fold again so
            // callers can rely on a lower-case result.
            $s = strtolower($ascii);
        }

        // preg_replace() returns null on failure; never leak that through the
        // declared string return type.
        $s = preg_replace('/[^a-z0-9 ]+/i', ' ', $s) ?? '';

        return preg_replace('/\s+/', ' ', trim($s)) ?? '';
    }

    /**
     * Normalized form of a title: ASCII-folded and capped at 160 bytes.
     *
     * @param string $title Raw post title.
     *
     * @return string Normalized title.
     */
    private static function normTitle(string $title): string
    {
        return substr(self::asciiFold($title), 0, 160);
    }

    /**
     * Splits a search term into ranking tokens.
     *
     * Keeps, in first-seen order and without duplicates:
     *  - tokens of 2+ characters
     *  - digit-only tokens (always)
     *  - 1-character tokens on the whitelist, e.g. "f" (very relevant for f'c)
     *
     * @param string $term Search term.
     *
     * @return list<string> Folded tokens.
     */
    private static function tokens(string $term): array
    {
        $keep1 = ['f', 'x']; // extend if needed (e.g. 'h' for sections, etc.)

        $parts = [];
        foreach (explode(' ', self::asciiFold($term)) as $piece) {
            if ($piece === '') {
                continue;
            }
            // asciiFold() turns "/" into a space, so "3/4" arrives here as the
            // two digit tokens "3" and "4"; digit runs such as "250" are
            // always kept regardless of length.
            if (ctype_digit($piece) || strlen($piece) >= 2 || in_array($piece, $keep1, true)) {
                $parts[] = $piece;
            }
        }

        // array_unique() keeps the first occurrence, so order stays stable.
        return array_values(array_unique($parts));
    }

    /** --- FULLTEXT boolean query (title only) --- */

    /**
     * Builds a MySQL BOOLEAN MODE expression from free-form user input.
     *
     * Rules:
     *  - "quoted phrases" become required phrases: +"..."
     *  - the bare words AND / OR / NOT (any case) are passed through upper-cased
     *  - every other word becomes a prefix match: +word*, or -word* when the
     *    user typed a leading "-"
     *  - words shorter than 2 characters, and tokens without any letter or
     *    digit, are dropped
     *
     * Operator characters typed inside a word ("c++", "foo-bar", "250*") are
     * treated as separators instead of being wrapped verbatim, because the
     * wrapper adds its own leading operator and trailing "*"; stacked or
     * trailing operators are not valid boolean-mode syntax (InnoDB reports a
     * syntax error for them).
     *
     * @param string $input Raw search input.
     *
     * @return string Boolean-mode expression, or '' when nothing usable remains.
     */
    private static function booleanQuery(string $input): string
    {
        // Keep only letters, digits, whitespace, quotes and the operator characters.
        $input = preg_replace("/[^\\p{L}\\p{N}\\s\"'\\+\\-\\*]/u", ' ', trim($input)) ?? '';
        if ($input === '') {
            return '';
        }

        // A token is either a quoted phrase (whitespace kept, closing quote
        // optional at end of input) or a run of non-space, non-quote characters.
        if (!preg_match_all('/"[^"]*"?|[^\s"]+/u', $input, $m)) {
            return '';
        }

        $parts = [];
        foreach ($m[0] as $tok) {
            $quoted = $tok[0] === '"';
            if ($quoted && (strlen($tok) === 1 || substr($tok, -1) !== '"')) {
                $tok .= '"'; // close a phrase left open at end of input
            }

            // Nothing searchable: "", lone operators, lone apostrophes, ...
            if (mb_strlen($tok, 'UTF-8') < 2 || !preg_match('/[\p{L}\p{N}]/u', $tok)) {
                continue;
            }

            if ($quoted) {
                $parts[] = '+' . $tok;
                continue;
            }

            $upper = strtoupper($tok);
            if (in_array($upper, ['AND', 'OR', 'NOT'], true)) {
                $parts[] = $upper;
                continue;
            }

            // A leading "-" means "exclude"; anything else means "require".
            $op = $tok[0] === '-' ? '-' : '+';
            $words = preg_split('/[+\-*]+/', $tok, -1, PREG_SPLIT_NO_EMPTY) ?: [];
            foreach ($words as $word) {
                if (mb_strlen($word, 'UTF-8') < 2) {
                    continue;
                }
                $parts[] = $op . $word . '*';
            }
        }

        return implode(' ', $parts);
    }

    /** --- Signals / boosts --- */

    /**
     * Boost proportional to the share of tokens found in the title.
     *
     * Matching is a plain substring test on the folded title (no word
     * boundaries).
     *
     * @param string       $title  Raw post title.
     * @param list<string> $tokens Folded query tokens.
     *
     * @return float Value in [0, W_COVERAGE].
     */
    private static function coverageBoost(string $title, array $tokens): float
    {
        if (!$tokens) {
            return 0.0;
        }

        $folded = self::asciiFold($title);
        $hits = 0;
        foreach ($tokens as $tok) {
            $needle = (string) $tok;
            if ($needle !== '' && str_contains($folded, $needle)) {
                $hits++;
            }
        }

        return ($hits / count($tokens)) * self::W_COVERAGE;
    }

    /**
     * Penalizes missing tokens when the query is short (<= 4 tokens).
     * E.g. "concreto f 250" -> if "f" is missing from the title: heavy penalty.
*/
    private static function requiredTokensPenalty(string $title, array $tokens): float
    {
        // Only short queries are penalized; longer ones are expected to match partially.
        $total = count($tokens);
        if ($total === 0 || $total > 4) {
            return 0.0;
        }

        // Substring test on the folded title (no word boundaries).
        $folded = self::asciiFold($title);
        $hits = 0;
        foreach ($tokens as $tok) {
            $needle = (string) $tok;
            if ($needle !== '' && str_contains($folded, $needle)) {
                $hits++;
            }
        }

        $missing = $total - $hits;
        if ($missing <= 0) {
            return 0.0;
        }

        // Negative score: flat base penalty plus a per-missing-token penalty.
        return -(self::REQ_BASE_PENALTY + self::REQ_MISS_PER_TOKEN * $missing);
    }

    /**
     * Boost when the folded title starts with the folded term.
     *
     * @param string $title Raw post title.
     * @param string $term  Raw search term.
     *
     * @return float W_STARTSWITH on a prefix match, otherwise 0.0.
     */
    private static function startsWithBoost(string $title, string $term): float
    {
        $needle = self::asciiFold($term);
        // str_starts_with() is true for an empty needle; a term that folds to
        // nothing must not award the boost (same guard as wordExactBoost()).
        if ($needle === '') {
            return 0.0;
        }

        return str_starts_with(self::asciiFold($title), $needle) ? self::W_STARTSWITH : 0.0;
    }

    /**
     * Boost when the folded term appears in the folded title delimited by
     * word boundaries.
     *
     * @param string $title Raw post title.
     * @param string $term  Raw search term.
     *
     * @return float W_WORD_EXACT on a match, otherwise 0.0.
     */
    private static function wordExactBoost(string $title, string $term): float
    {
        $needle = self::asciiFold($term);
        if ($needle === '') {
            return 0.0;
        }

        $haystack = ' ' . self::asciiFold($title) . ' ';
        $pattern = '/\b' . preg_quote($needle, '/') . '\b/u';

        return preg_match($pattern, $haystack) ? self::W_WORD_EXACT : 0.0;
    }

    /**
     * Boost that decays with the age of the post.
     *
     * Age is clamped to at least one day, so future dates score like
     * one-day-old posts.
     *
     * @param string $date Any date string strtotime() understands.
     *
     * @return float W_RECENCY_MAX / (1 + days / 180); 0.0 when the date cannot be parsed.
     */
    private static function recencyBoost(string $date): float
    {
        $timestamp = strtotime($date);
        if (!$timestamp) {
            return 0.0;
        }

        $days = max(1, (time() - $timestamp) / 86400);

        return self::W_RECENCY_MAX / (1.0 + $days / 180.0);
    }

    /**
     * Levenshtein-based similarity between two strings, in [0, 1].
     *
     * Both inputs are folded and cut to their first 80 bytes before comparing.
     *
     * @param string $a First string.
     * @param string $b Second string.
     *
     * @return float 1.0 for identical folded strings; 0.0 when either folds to ''.
     */
    private static function levenshteinSimilarity(string $a, string $b): float
    {
        $left = substr(self::asciiFold($a), 0, 80);
        $right = substr(self::asciiFold($b), 0, 80);
        if ($left === '' || $right === '') {
            return 0.0;
        }

        // Both strings are non-empty here, so the divisor is at least 1.
        $longest = max(strlen($left), strlen($right));

        return max(0.0, 1.0 - (levenshtein($left, $right) / $longest));
    }

    /**
     * Boost from the best fuzzy match between any query token and any of the
     * first 12 words of the title.
     *
     * @param string       $title  Raw post title.
     * @param list<string> $tokens Query tokens.
     *
     * @return float Best similarity (0..1) times W_FUZZY_TOKEN_MAX.
     */
    private static function tokenFuzzyBoost(string $title, array $tokens): float
    {
        if (!$tokens) {
            return 0.0;
        }

        $words = array_slice(preg_split('/\s+/', self::asciiFold($title)) ?: [], 0, 12);
        if (!$words) {
            return 0.0;
        }

        $best = 0.0;
        foreach ($tokens as $tok) {
            $needle = self::asciiFold((string) $tok);
            if ($needle === '') {
                continue; // nothing to compare; checked once per token, not per word
            }
            foreach ($words as $word) {
                if ($word === '') {
                    continue;
                }
                $similarity = 1.0 - (levenshtein($needle, $word) / max(strlen($needle), strlen($word)));
                if ($similarity > $best) {
                    $best = $similarity;
                }
            }
        }

        return max(0.0, $best) * self::W_FUZZY_TOKEN_MAX;
    }

    /**
     * Finds occurrences of a token in a folded title, with boundary control:
     *  - purely numeric or purely alphabetic tokens need a non-alphanumeric
     *    character (or the string edge) on both sides
     *  - mixed tokens (letters + digits, e.g. "15x15") only need it on the
     *    left; the right side is relaxed to allow suffixes ("15x15cm")
     *
     * @param string $foldedTitle Title already passed through asciiFold().
     * @param string $token       Folded token to look for.
     *
     * @return list<array{0: int, 1: int}> [start, end) byte offsets, in ascending order.
     */
    private static function findPositions(string $foldedTitle, string $token): array
    {
        $found = [];
        $titleLen = strlen($foldedTitle);
        $tokenLen = strlen($token);
        if ($tokenLen === 0) {
            return $found;
        }

        // e.g. "15x15": contains at least one letter and one digit
        $isMixed = preg_match('/[a-z]/', $token) === 1 && preg_match('/[0-9]/', $token) === 1;

        $from = 0;
        while (($at = strpos($foldedTitle, $token, $from)) !== false) {
            $end = $at + $tokenLen;
            // String edges count as boundaries.
            $left = $at > 0 ? $foldedTitle[$at - 1] : ' ';
            $right = $end < $titleLen ? $foldedTitle[$end] : ' ';

            $leftOk = !ctype_alnum($left);               // left boundary is always required
            $rightOk = $isMixed || !ctype_alnum($right); // relaxed only for mixed tokens
            if ($leftOk && $rightOk) {
                $found[] = [$at, $end];
            }

            $from = $at + 1; // overlapping occurrences are considered too
        }

        return $found;
    }

    /** ===== Proximity by character span ===== */
    // NOTE(review): proximityBoost() and orderedWindowBoost() are CORRUPTED in
    // this copy of the file. Source text is missing after "$r" in the first
    // `for` header (the rest of proximityBoost() and the signature plus first
    // statement of orderedWindowBoost(), which run() calls) and again after
    // "$i" in the later `for` header. It looks as if everything between a "<"
    // and a following ">" was stripped. The statements below are reproduced
    // token-for-token as found and do not parse; restore both methods from
    // version control instead of guessing at the missing logic.
    private static function proximityBoost(string $title, array $tokens): float
    {
        $tokens = array_values(array_unique(array_filter($tokens, fn($t)=>$t !== '')));
        if (count($tokens) < 2) return 0.0;
        $T = self::asciiFold($title);
        $occ = [];
        foreach ($tokens as $tok) {
            foreach (self::findPositions($T, $tok) as $p) {
                $occ[] = ['pos'=>$p[0], 'end'=>$p[1], 'tok'=>$tok];
            }
        }
        if (!$occ) return 0.0;
        usort($occ, fn($a,$b)=> $a['pos'] <=> $b['pos']);
        $present = [];
        foreach ($occ as $o) $present[$o['tok']] = true;
        $needCount = count($present);
        if ($needCount < 2) return 0.0;
        $cnt = [];
        $covered = 0;
        $bestSpan = PHP_INT_MAX;
        $bestL = 0;
        $bestR = -1;
        // NOTE(review): text lost in the next line, between "$r" and "$t".
        for ($r=0,$l=0; $r$t !== '')));
        if (count($tokens) < 2) return 0.0;
        $T = self::asciiFold($title);
        $posList = [];
        foreach ($tokens as $t) {
            $posList[$t] = self::findPositions($T, $t);
            if (!$posList[$t]) return 0.0; // if any token is missing there is no ordered window
        }
        $bestSpanChars = PHP_INT_MAX;
        $bestStart = -1;
        $t0 = $tokens[0];
        foreach ($posList[$t0] as $p0) {
            $start = $p0[0];
            $end = $p0[1];
            $ok = true;
            $cursor = $end;
            // NOTE(review): text lost in the next line, between "$i" and "= $cursor".
            for ($i=1; $i= $cursor) {
                        $end = max($end, $pp[1]);
                        $cursor = $pp[1];
                        $found = true;
                        break;
                    }
                }
                if (!$found) {
                    $ok = false;
                    break;
                }
            }
            if ($ok) {
                $span = $end - $start;
                if ($span < $bestSpanChars) {
                    $bestSpanChars = $span;
                    $bestStart = $start;
                }
            }
        }
        if ($bestSpanChars === PHP_INT_MAX) return 0.0;
        $slice = substr($T, max(0,$bestStart), max(1,$bestSpanChars));
        $wordsInSpan = max(1, count(array_filter(explode(' ', $slice))));
        $tightness = count($tokens) / $wordsInSpan;
        $score = self::W_ORDERED_WINDOW * $tightness;
        if ($bestStart <= 6) $score += self::W_ORDERED_ANCHOR;
        return $score;
    }

    /** Penalty for excessively long titles.
*/
    private static function lengthPenalty(string $title): float
    {
        // Titles up to LEN_PEN_START characters are free; every extra
        // character costs LEN_PEN_PER_CHAR, capped at 300 points.
        $length = mb_strlen($title, 'UTF-8');
        if ($length <= self::LEN_PEN_START) {
            return 0.0;
        }

        $extra = $length - self::LEN_PEN_START;

        return -min(300.0, $extra * self::LEN_PEN_PER_CHAR);
    }

    /** --- Fetch buckets (title only) --- */

    /**
     * Escapes LIKE wildcards so a value matches literally under ESCAPE '\'.
     *
     * @param string $value Raw text.
     *
     * @return string Text with "\", "%" and "_" backslash-escaped.
     */
    private static function likeEscape(string $value): string
    {
        return str_replace(['\\', '%', '_'], ['\\\\', '\%', '\_'], $value);
    }

    /**
     * Prepares and runs one bucket query, then logs how many rows came back.
     *
     * @param string               $sql      SQL containing a :lim placeholder.
     * @param array<string, mixed> $params   Placeholder => value, bound with PDO's default type.
     * @param int                  $limit    Value bound to :lim as an integer.
     * @param string               $event    Log event name.
     * @param array<string, mixed> $logExtra Extra log fields, emitted before 'count'.
     *
     * @return list<array<string, mixed>> Rows keyed by column name.
     */
    private function fetchBucket(string $sql, array $params, int $limit, string $event, array $logExtra = []): array
    {
        $st = $this->pdo->prepare($sql);
        foreach ($params as $placeholder => $value) {
            $st->bindValue($placeholder, $value);
        }
        $st->bindValue(':lim', $limit, PDO::PARAM_INT);
        $st->execute();

        // Explicit mode: run() reads rows by column name, so do not depend on
        // the connection's default fetch mode.
        $rows = $st->fetchAll(PDO::FETCH_ASSOC);
        Logger::log($this->logEnabled, $event, $logExtra + ['count' => count($rows)]);

        return $rows;
    }

    /**
     * Published posts whose title equals the term under utf8mb4_general_ci.
     *
     * @param string $term  Search term.
     * @param int    $limit Maximum number of rows.
     *
     * @return list<array<string, mixed>> Rows with ID, post_title, post_date; newest first.
     */
    private function fetchEquals(string $term, int $limit): array
    {
        $sql = "SELECT ID, post_title, post_date FROM {$this->table} "
            . "WHERE post_type='post' AND post_status='publish' "
            . "AND post_title COLLATE utf8mb4_general_ci = :t "
            . "ORDER BY post_date DESC LIMIT :lim";

        return $this->fetchBucket($sql, [':t' => $term], $limit, 'bucket_equals');
    }

    /**
     * Published posts whose title starts with the term (literal LIKE prefix).
     *
     * @param string $term  Search term.
     * @param int    $limit Maximum number of rows.
     *
     * @return list<array<string, mixed>> Rows with ID, post_title, post_date; newest first.
     */
    private function fetchStartsWith(string $term, int $limit): array
    {
        $sql = "SELECT ID, post_title, post_date FROM {$this->table} "
            . "WHERE post_type='post' AND post_status='publish' "
            . "AND post_title LIKE :p ESCAPE '\\\\' "
            . "ORDER BY post_date DESC LIMIT :lim";

        return $this->fetchBucket($sql, [':p' => self::likeEscape($term) . '%'], $limit, 'bucket_starts');
    }

    /**
     * Published posts matching the term through FULLTEXT (boolean mode) on the title.
     *
     * @param string $term  Search term.
     * @param int    $limit Maximum number of rows.
     *
     * @return list<array<string, mixed>> Rows with ID, post_title, post_date and raw_rel
     *         (the MATCH score), ordered by raw_rel then date, both descending.
     *         Empty when the term yields no boolean query.
     */
    private function fetchFulltextTitle(string $term, int $limit): array
    {
        $q = self::booleanQuery($term);
        if ($q === '') {
            return [];
        }

        // Two distinct placeholders: a named marker may appear only once per
        // statement unless PDO emulates prepares, so reusing ":q" would break
        // on connections with emulation turned off.
        $sql = "SELECT ID, post_title, post_date, "
            . "MATCH(post_title) AGAINST (:q1 IN BOOLEAN MODE) AS raw_rel "
            . "FROM {$this->table} "
            . "WHERE post_type='post' AND post_status='publish' "
            . "AND MATCH(post_title) AGAINST (:q2 IN BOOLEAN MODE) "
            . "ORDER BY raw_rel DESC, post_date DESC LIMIT :lim";

        return $this->fetchBucket($sql, [':q1' => $q, ':q2' => $q], $limit, 'bucket_fulltext', ['q' => $q]);
    }

    /**
     * Published posts whose title contains ALL tokens (one LIKE per token).
     * Raises recall of good candidates.
     *
     * @param list<string> $tokens Query tokens.
     * @param int          $limit  Maximum number of rows.
     *
     * @return list<array<string, mixed>> Rows with ID, post_title, post_date; newest first.
     */
    private function fetchAllTokensLike(array $tokens, int $limit): array
    {
        if (!$tokens) {
            return [];
        }

        $conditions = [];
        $params = [];
        // array_values(): placeholder names are built from the index, so make
        // sure it is a clean 0..n-1 sequence.
        foreach (array_values($tokens) as $i => $tok) {
            $conditions[] = "post_title LIKE :lk{$i} ESCAPE '\\\\'";
            $params[":lk{$i}"] = '%' . self::likeEscape((string) $tok) . '%';
        }
        $where = implode(' AND ', $conditions);

        $sql = "SELECT ID, post_title, post_date FROM {$this->table} "
            . "WHERE post_type='post' AND post_status='publish' "
            . "AND {$where} "
            . "ORDER BY post_date DESC LIMIT :lim";

        return $this->fetchBucket($sql, $params, $limit, 'bucket_like_all');
    }

    /**
     * Published posts whose title contains the whole term (literal LIKE).
     *
     * @param string $term  Search term.
     * @param int    $limit Maximum number of rows.
     *
     * @return list<array<string, mixed>> Rows with ID, post_title, post_date; newest first.
     */
    private function fetchContains(string $term, int $limit): array
    {
        $sql = "SELECT ID, post_title, post_date FROM {$this->table} "
            . "WHERE post_type='post' AND post_status='publish' "
            . "AND post_title LIKE :l ESCAPE '\\\\' "
            . "ORDER BY post_date DESC LIMIT :lim";

        return $this->fetchBucket($sql, [':l' => '%' . self::likeEscape($term) . '%'], $limit, 'bucket_contains');
    }

    /** --- Merge + dedupe + re-rank --- */

    /**
     * Runs the hybrid title search: fetches candidates from every bucket,
     * merges them by normalized title, re-ranks them and returns one page.
     *
     * @param string $term   Search term (already validated by the caller).
     * @param int    $limit  Page size; negative values are treated as 0.
     * @param int    $offset Offset into the ranked pool; negative values are treated as 0.
     *
     * @return array{total: int, rows: list<array{ID: int, post_title: string, post_date: string}>, modo: string, time_ms: float}
     *         'total' is the size of the deduplicated candidate pool.
     */
    public function run(string $term, int $limit, int $offset): array
    {
        $t0 = microtime(true);

        // Negative values would reach SQL LIMIT and array_slice(), where they
        // either fail or mean something else entirely.
        $limit = max(0, $limit);
        $offset = max(0, $offset);

        $tokens = self::tokens($term);

        // Pool sizes
        $capFull = max(120, min(300, $limit * 8));
        $capLike = max(120, min(300, $limit * 8));
        $capPref = max(60, min(200, $limit * 4));
        $capEq   = min(40, $limit * 2);
        $capCont = max(80, min(240, $limit * 6));

        $buckets = [
            ['name' => 'LIKE_ALL', 'base' => 900.0,  'rows' => $this->fetchAllTokensLike($tokens, $capLike)],
            ['name' => 'FULLTEXT', 'base' => 700.0,  'rows' => $this->fetchFulltextTitle($term, $capFull)],
            ['name' => 'STARTS',   'base' => 650.0,  'rows' => $this->fetchStartsWith($term, $capPref)],
            ['name' => 'CONTAINS', 'base' => 500.0,  'rows' => $this->fetchContains($term, $capCont)],
            ['name' => 'EQUALS',   'base' => 1200.0, 'rows' => $this->fetchEquals($term, $capEq)],
        ];

        // Deduplicate by normalized title. A title returned by several buckets
        // keeps the row seen first but takes the HIGHEST base weight and the
        // highest raw_rel among them, so the score does not depend on bucket
        // order. With first-seen-wins, LIKE_ALL shadowed EQUALS and hid the
        // FULLTEXT relevance for nearly every row.
        $index = []; // normalized title => position in $pool
        $pool = [];
        foreach ($buckets as $bucket) {
            $base = (float) $bucket['base'];
            foreach ($bucket['rows'] as $row) {
                $norm = self::normTitle((string) $row['post_title']);
                $rawRel = isset($row['raw_rel']) ? (float) $row['raw_rel'] : 0.0;

                if (isset($index[$norm])) {
                    $at = $index[$norm];
                    if ($base > $pool[$at]['baseW']) {
                        $pool[$at]['baseW'] = $base;
                        $pool[$at]['bucket'] = $bucket['name'];
                    }
                    if ($rawRel > $pool[$at]['raw_rel']) {
                        $pool[$at]['raw_rel'] = $rawRel;
                    }
                    continue;
                }

                $index[$norm] = count($pool);
                $pool[] = [
                    'ID'         => (int) $row['ID'],
                    'post_title' => (string) $row['post_title'],
                    'post_date'  => (string) $row['post_date'],
                    'bucket'     => $bucket['name'],
                    'baseW'      => $base,
                    'raw_rel'    => $rawRel,
                ];
            }
        }

        $poolTotal = count($pool);
        if ($poolTotal === 0) {
            $elapsed = round((microtime(true) - $t0) * 1000, 2);
            Logger::log($this->logEnabled, 'consulta_ejecutada', [
                'modo' => 'HYBRID', 'total' => 0, 'filas' => 0, 'tiempo_ms' => $elapsed, 'pool_total' => 0,
            ]);

            return ['total' => 0, 'rows' => [], 'modo' => 'HYBRID', 'time_ms' => $elapsed];
        }

        // Re-rank
        foreach ($pool as $i => $item) {
            $title = $item['post_title'];

            $pool[$i]['score'] = $item['baseW']
                + ($item['raw_rel'] * self::RAW_REL_MULT)
                + self::coverageBoost($title, $tokens)
                + self::orderedWindowBoost($title, $tokens)      // *** dominant signal ***
                + self::proximityBoost($title, $tokens)          // close together (not necessarily in order)
                + self::startsWithBoost($title, $term)
                + self::wordExactBoost($title, $term)
                + (self::levenshteinSimilarity($title, $term) * 160.0)
                + self::tokenFuzzyBoost($title, $tokens)
                + self::recencyBoost($item['post_date'])
                + self::lengthPenalty($title)
                + self::requiredTokensPenalty($title, $tokens);  // *** penalty when tokens are missing ***
        }

        // Highest score first; ties broken by newest post_date (string comparison).
        usort(
            $pool,
            static fn(array $a, array $b): int =>
                ($b['score'] <=> $a['score']) ?: strcmp($b['post_date'], $a['post_date'])
        );

        // Pagination
        $rows = array_map(
            static fn(array $r): array => [
                'ID'         => $r['ID'],
                'post_title' => $r['post_title'],
                'post_date'  => $r['post_date'],
                // for debugging, also expose: 'score' => $r['score'], 'bucket' => $r['bucket']
            ],
            array_slice($pool, $offset, $limit)
        );

        $elapsed = round((microtime(true) - $t0) * 1000, 2);
        Logger::log($this->logEnabled, 'consulta_ejecutada', [
            'modo'       => 'HYBRID',
            'total'      => $poolTotal,
            'filas'      => count($rows),
            'tiempo_ms'  => $elapsed,
            'pool_total' => $poolTotal,
            'buckets'    => [
                'like_all' => count($buckets[0]['rows']),
                'fulltext' => count($buckets[1]['rows']),
                'starts'   => count($buckets[2]['rows']),
                'contains' => count($buckets[3]['rows']),
                'equals'   => count($buckets[4]['rows']),
            ],
        ]);

        return ['total' => $poolTotal, 'rows' => $rows, 'modo' => 'HYBRID', 'time_ms' => $elapsed];
    }
}