import itertools
from collections import Counter
import editdistance
import nltk
# --- Model estimation -------------------------------------------------------
# Build relative-frequency unigram and bigram language models from the Brown
# corpus, plus the dictionary of known words used for candidate generation.
corpus = list(nltk.corpus.brown.words())
unigrams = corpus  # alias: every corpus token is a unigram observation
bigrams = list(nltk.bigrams(corpus))

# Normalize raw counts into probabilities (hoist len() out of the loops).
unigram_lm = Counter(unigrams)
n_unigrams = len(unigrams)
for token in unigram_lm:
    unigram_lm[token] /= n_unigrams

bigram_lm = Counter(bigrams)
n_bigrams = len(bigrams)
for pair in bigram_lm:
    bigram_lm[pair] /= n_bigrams

# Vocabulary of known words; candidate corrections are drawn from this set.
dictionary = set(corpus)
# Exponent weighting the language-model term when scoring a candidate
# sentence (small value => error model dominates).
lm_lambda = 0.001
def error_model(query, word):
    """Similarity of *word* to the (possibly misspelled) *query* token.

    Returns 1 - normalized Levenshtein distance, i.e. a score in [0, 1]
    where 1.0 means the strings are identical.

    Fix: guard the normalizer — when both strings are empty,
    max(len(query), len(word)) is 0 and the original raised
    ZeroDivisionError; two empty strings are identical, so return 1.0.
    """
    longest = max(len(query), len(word))
    if longest == 0:
        return 1.0
    return 1 - editdistance.eval(query, word) / longest
def language_model(words):
    """Probability of the word sequence *words* under the corpus LMs.

    A single word gets its unigram relative frequency. Longer sequences
    multiply, for each adjacent pair, an even interpolation of the pair's
    bigram probability and the left word's unigram probability, then
    multiply in the unigram probability of the final word.

    NOTE(review): the interpolation backs off to the unigram of the *left
    context* word rather than the predicted word — confirm this matches
    the intended smoothing scheme.
    """
    if len(words) == 1:
        return unigram_lm[words[0]]
    prob = 1
    for left, right in zip(words, words[1:]):
        prob *= bigram_lm[left, right] * 0.5 + unigram_lm[left] * 0.5
    prob *= unigram_lm[words[-1]]
    return prob
def score_sent(sent):
    """Noisy-channel score of a candidate sentence.

    *sent* is a sequence of (word, error_score) pairs. The score is the
    product of all per-word error-model scores, times the language-model
    probability of the word sequence raised to ``lm_lambda``.
    """
    total = 1
    # Multiply error scores first, in order, to preserve float semantics.
    for _word, err in sent:
        total *= err
    total *= language_model([w for w, _ in sent]) ** lm_lambda
    return total
# --- Interactive correction loop --------------------------------------------
# Read a query, rank every dictionary word per query token by the error
# model, then score the cross product of the top candidates.
# NOTE: itertools.product fans out as 10**len(query) sentences, so long
# queries get expensive.
while True:
    query = input('\nquery: ').split()

    candidate_words = []
    for token in query:
        scored = sorted(
            ((w, error_model(token, w)) for w in dictionary),
            key=lambda pair: pair[1],
            reverse=True,
        )
        candidate_words.append(scored[:10])

    print('\ncandidate words:')
    for cands in candidate_words:
        print(cands)

    candidate_sentences = list(itertools.product(*candidate_words))
    sents_with_scores = [(s, score_sent(s)) for s in candidate_sentences]

    print('\ndid you mean...')
    ranked = sorted(sents_with_scores, key=lambda pair: pair[1], reverse=True)
    for sent, sent_score in ranked[:10]:
        print(sent_score, '\t', ' '.join([w for w, _ in sent]))