Spellcheck
10. 9. 2012 (před 10 lety) — k47 (CC by)
<?php
// PHP implementation of http://norvig.com/spell-correct.html
// based on http://soundofemotion.com/spellcorrect.txt which is much longer

/**
 * Split text into lowercase ASCII words.
 *
 * The text is lowercased first so that capitalised words are kept whole
 * instead of being split at their uppercase letters.
 *
 * @param string $text raw text
 * @return array list of words, each matching [a-z]+
 */
function words($text)
{
    // -1 means "no limit"; passing null for $limit is deprecated since PHP 8.1.
    return preg_split("~[^a-z]+~", strtolower($text), -1, PREG_SPLIT_NO_EMPTY);
}

/**
 * Build the frequency model: word => number of occurrences.
 *
 * @param array $features list of words
 * @return array map of word to occurrence count
 */
function train($features)
{
    return array_count_values($features);
}

/**
 * Generate every string that is one edit away from $word.
 *
 * An edit is a deletion, a transposition of two adjacent characters,
 * a replacement of one character, or an insertion of one character,
 * using the letters a-z.
 *
 * @param string $word
 * @return array de-duplicated edits (keys are not contiguous, because
 *               array_unique() preserves the original keys)
 */
function edits1($word)
{
    $edits = array();
    $alphabet = "abcdefghijklmnopqrstuvwxyz";
    $length = strlen($word);
    $letters = strlen($alphabet);

    // deletion
    for ($x = 0; $x < $length; $x++) {
        $edits[] = substr($word, 0, $x) . substr($word, $x + 1);
    }

    // transposition
    for ($x = 0; $x < $length - 1; $x++) {
        $edits[] = substr($word, 0, $x) . $word[$x + 1] . $word[$x] . substr($word, $x + 2);
    }

    // alteration
    for ($c = 0; $c < $letters; $c++) {
        for ($x = 0; $x < $length; $x++) {
            $edits[] = substr($word, 0, $x) . $alphabet[$c] . substr($word, $x + 1);
        }
    }

    // insertion (one more position than there are characters)
    for ($c = 0; $c < $letters; $c++) {
        for ($x = 0; $x <= $length; $x++) {
            $edits[] = substr($word, 0, $x) . $alphabet[$c] . substr($word, $x);
        }
    }

    return array_unique($edits);
}

/**
 * Collect the words of the model that are two edits away from $word.
 *
 * @param string $word
 * @param array $nwords frequency model (word => count)
 * @return array de-duplicated known words reachable by two edits
 */
function known_edits2($word, $nwords)
{
    $edits = array();
    foreach (edits1($word) as $e1) {
        foreach (edits1($e1) as $e2) {
            if (isset($nwords[$e2])) {
                $edits[] = $e2;
            }
        }
    }

    return array_unique($edits);
}

/**
 * Keep only those of $words that appear as keys in the model.
 *
 * @param array $words candidate words
 * @param array $nwords frequency model (word => count)
 * @return array the subset of $words present in $nwords
 */
function known($words, $nwords)
{
    // Flip so the words become keys, intersect with the model keys, flip back.
    return array_flip(array_intersect_key(array_flip($words), $nwords));
}

/**
 * Return the correction candidates for $word, from the closest tier that
 * yields anything: the word itself if known, then known words one edit
 * away, then known words two edits away, and finally the word unchanged.
 *
 * @param string $word
 * @param array $nwords frequency model (word => count)
 * @return array non-empty list of candidate words
 */
function candidates($word, $nwords)
{
    if (isset($nwords[$word])) {
        return array($word);
    }

    $known = known(edits1($word), $nwords);
    if ($known) {
        return $known;
    }

    $known = known_edits2($word, $nwords);
    if ($known) {
        return $known;
    }

    return array($word);
}

/**
 * Pick the most frequent candidate correction for $word.
 *
 * Candidates missing from the model (only the unknown input word itself)
 * are weighted 1. On equal counts the first candidate wins.
 *
 * @param string $word word to correct
 * @param array $nwords frequency model (word => count)
 * @return string the chosen correction, or $word itself when nothing better is known
 */
function correct($word, $nwords)
{
    $best = $word;
    $bestCount = -1;

    // A plain maximum scan: no by-reference loop variable is left dangling
    // and the $word parameter is not overwritten.
    foreach (candidates($word, $nwords) as $candidate) {
        $count = isset($nwords[$candidate]) ? $nwords[$candidate] : 1;
        if ($count > $bestCount) {
            $best = $candidate;
            $bestCount = $count;
        }
    }

    return $best;
}

// Demo: train on the corpus and correct one word.
$corpusPath = "big.txt";
$corpus = is_readable($corpusPath) ? file_get_contents($corpusPath) : false;
if ($corpus === false) {
    // Without the corpus the model would be empty and every word would be
    // returned unchanged, so report the problem instead of printing that.
    error_log("spellcheck: cannot read training corpus '" . $corpusPath . "'");
} else {
    $nwords = train(words($corpus));
    echo correct("thay", $nwords), "\n";
}