#! /usr/bin/env python
#! -*- coding:utf-8 -*-
#====#====#====#====
#!@Author : px
#!@time : 2020/3/26 16:08
#!@File : 3-26-1
#====#====#====#===
# Spelling correction
import sys
import math
import re
import collections
from collections import Counter


def read_file(filename):
    """Return the whole content of ``filename`` as one string.

    If the file cannot be opened or read (``IOError``), a message is
    printed and the process is terminated through ``sys.exit()``.
    """
    try:
        # The context manager closes the handle even when read() fails.
        with open(filename) as fp:
            text = fp.read()
    except IOError:
        print(filename,"文件打开失败")
        sys.exit()
    return text


# Tokenisation
def words(text):
    """Split ``text`` into a list of lower-cased word tokens.

    A token is a run of ``\\w`` characters, optionally containing one
    apostrophe followed by more ``\\w`` characters.
    """
    # NOTE(review): the pattern needs at least two word characters, so
    # one-letter words such as "a" are never returned -- confirm this is
    # intended.
    return re.findall(r'\w+\'?\w+', text.lower())


# Word-frequency counting / language-model training
def train(features):
    """Count how often each item of ``features`` occurs.

    Returns a ``defaultdict`` whose default value is 1, so an item seen
    k times is stored with the value k + 1, and indexing an unseen key
    yields 1 (and inserts that key).
    """
    model = collections.defaultdict(lambda: 1)
    for f in features:
        model[f] += 1
    return model


# Module-level model construction: runs at import time and reads big.txt.
A = words(read_file("big.txt"))
###############################
# Added
WORDS = Counter(A)        # raw token counts
#############################
NWORDS = train(A)         # counts offset by one (see train)
P = sum(NWORDS.values())  # sum of all values stored in NWORDS
# Dictionary lookup used when deciding whether a word needs correcting.
def known(words):
    """Return the set of entries of ``words`` that are keys of ``NWORDS``.

    The result is an empty set when none of them is present.
    """
    found = set()
    for candidate in words:
        if candidate in NWORDS:
            found.add(candidate)
    return found
# Characters tried for replacement and insertion: a-z plus the apostrophe.
alphabeta = 'qwertyuiopasdfghjklzxcvbnm\''


# Candidate words at edit distance 1
def edist1(word):
    """Return the set of strings one edit away from ``word``.

    An edit is deleting one character, swapping two adjacent characters,
    replacing one character with a character of ``alphabeta``, or
    inserting a character of ``alphabeta`` at any position, including
    after the last character.
    """
    n = len(word)
    # Deletion: drop the character at position i.
    deletes = [word[:i] + word[i + 1:] for i in range(n)]
    # Transposition: swap the characters at positions i and i + 1.
    transposes = [word[:i] + word[i + 1] + word[i] + word[i + 2:]
                  for i in range(n - 1)]
    # Replacement: substitute the character at position i.
    replaces = [word[:i] + c + word[i + 1:]
                for i in range(n) for c in alphabeta]
    # Insertion: n + 1 slots, so a character can also be appended.
    inserts = [word[:i] + c + word[i:]
               for i in range(n + 1) for c in alphabeta]
    return set(deletes + transposes + replaces + inserts)


# Edit distance 2: apply edist1 twice
def edist2(word):
    """Return every string reachable from ``word`` by two ``edist1`` steps."""
    return {e2 for e1 in edist1(word) for e2 in edist1(e1)}


def known_edist2(word):
    """Return the strings two ``edist1`` steps from ``word`` that are in ``NWORDS``.

    Filtering while generating avoids holding the full ``edist2`` set.
    """
    return {e2 for e1 in edist1(word) for e2 in edist1(e1) if e2 in NWORDS}


# Pick the most likely word
def correct(word):
    """Return the candidate for ``word`` with the highest ``NWORDS`` value.

    Candidates come from the first non-empty source, in this order: the
    word itself if known, known words one edit away, known words two
    edits away, and finally the unchanged word.
    """
    # ``or`` short-circuits, so the costlier sets are built only if needed.
    candidates = (known([word])
                  or known(edist1(word))
                  or known_edist2(word)
                  or [word])
    # .get() reads the count without inserting a missing key into NWORDS;
    # the fallback 1 matches the value used for unseen words.
    return max(candidates, key=lambda w: NWORDS.get(w, 1))


def count():
    """Print the share of word pairs in ``batch0.tab.txt`` that are corrected.

    The tokens of the file are taken pairwise: ``correct`` is applied to
    the first token of each pair and compared with the second.  The token
    count and the first two tokens are printed before the ratio.
    """
    batch0words = words(read_file('batch0.tab.txt'))
    print(len(batch0words), batch0words[0], batch0words[1])
    # zip stops at the shorter slice, so a trailing unpaired token is ignored.
    pairs = zip(batch0words[0::2], batch0words[1::2])
    con = sum(1 for source, target in pairs if correct(source) == target)
    print(con / (len(batch0words) * 0.5))
def read_file_lines(filename):
    """Return the lines of ``filename`` as a list (line endings kept).

    If the file cannot be opened or read (``IOError``), a message is
    printed and the process is terminated through ``sys.exit()``.
    """
    try:
        # The context manager closes the handle even when readlines() fails.
        with open(filename) as fp:
            text = fp.readlines()
    except IOError:
        print(filename,"文件打开失败")
        sys.exit()
    return text


def count2():
    """Print the share of lines in ``3.txt`` whose first word is corrected.

    Each line is tokenised with ``words``; the line counts as a hit when
    ``correct`` applied to its first token yields a token of that line.
    Lines without any token are skipped, and 0.0 is printed when no line
    could be evaluated.
    """
    con = 0
    evaluated = 0
    for linewords in read_file_lines("3.txt"):  # one record per line
        B = words(linewords)
        if not B:
            # Blank line: there is no first token to correct.
            continue
        evaluated += 1
        # NOTE(review): B contains B[0] itself, so a word returned
        # unchanged by correct() counts as a hit -- confirm this is intended.
        if correct(B[0]) in B:
            con += 1
    print(con / evaluated if evaluated else 0.0)
#############################
# Added
# (unit_test has been removed)
def spelltest(tests, verbose=False):
    """Run ``correct`` on every (right, wrong) pair and print a summary.

    ``tests`` must support ``len()`` and yield (right, wrong) pairs.
    With ``verbose`` set, each failed correction is printed.  The summary
    gives the share of correct results, the share of failures whose
    expected word is absent from ``WORDS``, and the throughput.
    """
    import time
    started = time.perf_counter()
    total = len(tests)
    hits = 0
    out_of_vocab = 0
    for expected, misspelt in tests:
        guess = correct(misspelt)
        if guess == expected:
            hits += 1
            continue
        # Failure: record whether the expected word was ever seen.
        if expected not in WORDS:
            out_of_vocab += 1
        if verbose:
            print('correction({}) => {} ({}); expected {} ({})'
                  .format(misspelt, guess, WORDS[guess],
                          expected, WORDS[expected]))
    elapsed = time.perf_counter() - started
    print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '
          .format(hits / total, total, out_of_vocab / total, total / elapsed))
def Testset(lines):
    """Turn 'right: wrong1 wrong2' lines into (right, wrong) pairs.

    Each line is split on ':'; the part after it is split on whitespace
    and every resulting word is paired with the part before it.
    """
    pairs = []
    for line in lines:
        # Unpacking requires exactly one ':' per line.
        right, wrongs = line.split(':')
        for wrong in wrongs.split():
            pairs.append((right, wrong))
    return pairs
# print(unit_tests()) # spelltest(Testset(open('spell-testset1.txt'))) # Development set # spelltest(Testset(open('spell-testset2.txt'))) # Final test set ############################