Python基础:英文拼写检查器
本文使用Python实现一个简单的英文拼写检查器。
程序主要功能描述如下:
- 从用户输入或指定文件读取文本内容
- 文本预处理:忽略非英文内容
- 逐一检查单词拼写(检测单词是否存在于单词库EnglishWords.txt中)
- 针对错误单词,可由用户选择忽略、问号标记、添加到词典、推荐正确单词等选项
- 使用字符串相似度(difflib.SequenceMatcher 计算的匹配比率)作为指标推荐正确单词
- 生成并输出统计数据
- 将统计数据和修订后的文本存储到指定文件中
- 菜单美化:添加边框
- 错误处理:防止程序因用户输入或文件读写等异常原因崩溃
# --- RyanXin SpellChecker ---
import os
import time
from difflib import SequenceMatcher
class SpellChecker():
    '''Interactive command-line English spell checker.

    Creating an instance runs the whole program: it loads the word list
    ("EnglishWords.txt") and the user dictionary ("userDictionary.txt"),
    then repeats menu -> checking -> report until the user quits, and
    finally writes the user dictionary back to disk.

    In this version every checked token is replaced by its cleaned,
    lower-cased form, so punctuation and case are not kept in the output.
    '''

    def __init__(self):
        '''start the process of SpellChecker'''
        # initialize English words list
        if not os.path.exists("EnglishWords.txt"):  # file check
            print("* Error: can not find the file \"EnglishWords.txt\".")
            return
        # use the hash set rather than list to speed up matching process
        self.wordList = self._loadWords("EnglishWords.txt")
        # initialize user dictionary which was kept in the file
        if os.path.exists("userDictionary.txt"):
            self.dictionary = self._loadWords("userDictionary.txt")
        else:
            self.dictionary = set()
        # main loop starts from here
        print("Welcome to use Spell Checker!", end='')
        try:
            while True:
                if self.displayMenu() == -1:
                    break  # quit
                self.checkingWords()
                self.generateResults()
                if not self._askYesNo("Return to the main menu? (y/n): "):
                    break  # quit
        finally:
            # store user dictionary even when the session ends abnormally
            # (e.g. Ctrl-C), so words added by the user are not lost
            self._saveDictionary()

    @staticmethod
    def _loadWords(path):
        '''read one word per line from `path` into a set, skipping blank lines'''
        # the context manager closes the file as soon as the set is built
        with open(path, "r") as f:
            return set(line.strip() for line in f if line.strip())

    def _saveDictionary(self):
        '''write the user dictionary to "userDictionary.txt", one word per line'''
        with open("userDictionary.txt", "w") as f:
            # sorted so that the file content is stable between runs
            f.writelines(w + '\n' for w in sorted(self.dictionary))

    @staticmethod
    def _askYesNo(prompt):
        '''repeat `prompt` until the user answers yes/y/no/n; return True for yes'''
        while True:
            answer = input(prompt).strip().lower()
            if answer in ('yes', 'y', 'no', 'n'):
                return answer[0] == 'y'

    @staticmethod
    def _isEnglishLetter(ch):
        '''return True only for the ASCII letters a-z and A-Z'''
        # str.isalpha() is also True for letters of other scripts (e.g. CJK),
        # which would make non-English text count as misspelt English words
        return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'

    def displayMenu(self):
        '''Main Menu: Work mode selection.

        Stores the text to check in self.texts.
        Returns 0 when a text has been loaded, -1 when the user quits.
        '''
        while True:
            print()
            # display main menu
            self.formatDisplay("1. Spell check a sentence;\n2. Spell check a file;\n0. Quit.", "Work mode selection menu.")
            while True:  # validate user input
                mode = input("What do you want to do: ").strip()
                if mode in ('1', '2', '0'):
                    break
                print("* Invalid input. Please choose again.")
            if mode == '0':  # quit
                return -1
            if mode == '1':  # Spell check a sentence
                self.texts = input("\nPlease enter a sentence: ").strip()
                if self.texts:  # check empty string
                    return 0
                print("* The sentence cannot be empty.")
                continue  # return to main menu
            # mode == '2': Spell check a file
            filename = input("\nPlease enter a filename: ").strip()
            if not filename:  # check empty string
                print("* The file name cannot be empty.")
                continue
            if not os.path.exists(filename):  # filename validation
                print("* Error: The file does not exist.")
                continue
            try:
                with open(filename, 'r') as f:
                    self.texts = f.read()
            except (OSError, ValueError):
                # OSError: unreadable path (directory, permissions, ...);
                # ValueError covers UnicodeDecodeError for non-text files.
                # A bare except would also swallow Ctrl-C.
                print("* Error: unexpected error occurred when reading your file.")
                continue
            return 0

    def checkingWords(self):
        '''word checking process.

        Splits self.texts on whitespace, checks every token and fills the
        counters that generateResults() reports. self.texts becomes a list
        holding the cleaned (or user-corrected) words.
        '''
        self.start_time = time.perf_counter()
        self.texts = self.texts.split()  # split original input
        self.original_length = len(self.texts)
        self.total_number = 0  # non-empty English words
        self.correct_number = 0  # words spelt correctly
        self.incorrect_number = 0  # words spelt incorrectly
        self.added_number = 0  # words added to the user dictionary
        self.accepted_number = 0  # words changed by the user accepting the suggested word
        # iterate with every word
        for i, raw in enumerate(self.texts):
            # filter out non-English characters and convert to lower case letters
            word = ''.join(x for x in raw if self._isEnglishLetter(x)).lower()
            # store processed word
            self.texts[i] = word
            progress = "(" + str(i+1) + "/" + str(self.original_length) + ")"
            if not word:  # skip empty word
                print("\nSkipped non-alpha word \"" + raw + "\". " + progress)
                continue
            self.total_number += 1
            print("\nchecking word: \"" + word + "\". " + progress)
            # words in wordlist or user dictionary will be treated as correct
            if word in self.wordList or word in self.dictionary:
                self.correct_number += 1
                print('OK')
            else:
                # deal with incorrect word
                self.handleIncorrectWord(i, word)
        self.end_time = time.perf_counter()

    def handleIncorrectWord(self, i, word):
        '''deal with incorrect word.

        i    -- index of the word in self.texts
        word -- the cleaned, lower-cased word that was not found
        '''
        print("Encountered an incorrect word.")
        self.formatDisplay("--> " + word + "\n\n1. Ignore;\n2. Mark;\n3. Add to dictionary;\n4. Suggest likely correct spelling.")
        while True:  # validate user input
            handle_method = input("Please choose the way to handle the incorrect word: ").strip()
            if handle_method in ('1', '2', '3', '4'):
                break
            print("* Invalid input. Please choose again.")
        if handle_method == '1':
            # ignore
            self.incorrect_number += 1
            print("\"" + word + "\" has been ignored.")
        elif handle_method == '2':
            # mark
            self.incorrect_number += 1
            self._markWord(i, word)
        elif handle_method == '3':
            # add to dictionary
            self.correct_number += 1
            self.dictionary.add(word)
            self.added_number += 1
            print("\"" + word + "\" has been added into the dictionary and will be treated as correct word in the future.")
        else:
            # give suggestion
            self._handleSuggestion(i, word)

    def _markWord(self, i, word):
        '''mark word with question marks and update original texts'''
        marked = '?' + word + '?'
        self.texts[i] = marked
        print("\"" + word + "\" has been marked as \"" + marked + "\".")

    def _suggestWord(self, word):
        '''return the known word most similar to `word` ('' if nothing matches)'''
        best_word, best_score = '', 0
        # match with every word in word list and user dictionary
        for source in (self.wordList, self.dictionary):
            for w in source:
                matcher = SequenceMatcher(None, word, w)
                # both quick ratios are cheap upper bounds on ratio(), so a
                # candidate that cannot beat the current best is skipped
                if matcher.real_quick_ratio() <= best_score or matcher.quick_ratio() <= best_score:
                    continue
                score = matcher.ratio()
                if score > best_score:
                    best_word, best_score = w, score
        return best_word

    def _handleSuggestion(self, i, word):
        '''offer the closest known word and apply the user's decision'''
        suggestion = self._suggestWord(word)
        if not suggestion:
            # nothing shares a single character with the word (or the word
            # lists are empty): there is nothing meaningful to accept
            self.incorrect_number += 1
            print("- No suggestion is available.")
            self._markWord(i, word)
            return
        # ask user accept or not
        print("- A possible suggestion is \"" + suggestion + "\".")
        if self._askYesNo("Accept this suggestion? (y/n): "):
            # accept the suggestion
            self.correct_number += 1
            self.accepted_number += 1
            # modify word and update original texts
            self.texts[i] = suggestion
            print("\"" + word + "\" has been changed to \"" + suggestion + "\".")
        else:
            # reject the suggestion
            self.incorrect_number += 1
            print("The suggestion has been rejected.")
            # if reject, the word will be marked for user to further check
            self._markWord(i, word)

    def generateResults(self):
        '''show the summary statistics and save them with the checked text'''
        # generate summary statistics
        statistics = "\n".join([
            "- Original words number: " + str(self.original_length) + ";",
            "- Total number of English words: " + str(self.total_number) + ";",
            "- Words spelt correctly: " + str(self.correct_number) + ";",
            "- Words spelt incorrectly: " + str(self.incorrect_number) + ";",
            "- Words added to the dictionary: " + str(self.added_number) + ";",
            "- Suggested words accepted: " + str(self.accepted_number) + ";",
            # drop the leading weekday ("Mon ") from the current local time
            "- Spellcheck at: " + time.asctime()[4:] + ";",
            "- The amount of time elapsed: {:.2f}s.".format(self.end_time - self.start_time),
        ])
        print()
        self.formatDisplay(statistics, "Summary Statistics")
        # generate file with summary statistics and checked texts
        while True:
            result_file_name = input("Name of new file with results: ").strip()
            if not result_file_name:  # ensure file name is not empty
                print("* The file name cannot be empty.")
                continue
            try:
                with open(result_file_name, 'w') as f:
                    f.write("Spellcheck Statistics\n")
                    f.write(statistics)
                    f.write('\n\n')
                    f.write(' '.join(w for w in self.texts if w))  # skip empty words
                    f.write('\n')
            except (OSError, ValueError):
                # invalid file name, permissions, unencodable text, etc.
                # input() stays outside the try so that EOF / Ctrl-C at the
                # prompt ends the program instead of looping forever
                print("* Error: unexpected error occurred when writing into the file. Please try again.")
                continue
            print("File \"" + result_file_name + "\" has been successfully created.\n")
            break

    def formatDisplay(self, contents, title=None, width=40):
        '''beautify and display contents with borders.

        contents -- text to show, lines separated by "\\n" (left aligned)
        title    -- optional heading, centered above a dividing line
        width    -- minimum inner width of the box; the box grows when a
                    line or the title is longer, so borders stay aligned
        '''
        lines = contents.split('\n')
        width = max([width, len(title or '')] + [len(line) for line in lines])
        # the top border
        print("╔" + "═"*width + "╗")
        # display title area (center)
        if title:
            left_distance = (width-len(title))//2
            right_distance = width - len(title) - left_distance
            print("║" + " "*left_distance + title + " "*right_distance + "║")  # center the title
            print("║" + " " + "─"*(width-2) + " " + "║")  # Dividing line
        # display all contents (left align)
        for line in lines:
            print("║" + line.ljust(width) + "║")
        # the bottom border
        print("╚" + "═"*width + "╝")
if __name__ == "__main__":
    # constructing the checker starts the interactive session
    SpellChecker()
import os
import time
from difflib import SequenceMatcher
class SpellChecker():
    '''Interactive command-line English spell checker.

    Creating an instance runs the whole program: it loads the word list
    ("EnglishWords.txt") and the user dictionary ("userDictionary.txt"),
    then repeats menu -> checking -> report until the user quits, and
    finally writes the user dictionary back to disk.

    In this version the original tokens are kept in the output; only
    words that are marked or replaced are rewritten, and the non-letter
    characters around them (quotes, punctuation) are restored.
    '''

    def __init__(self):
        '''start the process of SpellChecker'''
        # initialize English words list
        if not os.path.exists("EnglishWords.txt"):  # file check
            print("* Error: can not find the file \"EnglishWords.txt\".")
            return
        # use the hash set rather than list to speed up matching process
        self.wordList = self._loadWords("EnglishWords.txt")
        # initialize user dictionary which was kept in the file
        if os.path.exists("userDictionary.txt"):
            self.dictionary = self._loadWords("userDictionary.txt")
        else:
            self.dictionary = set()
        # main loop starts from here
        print("Welcome to use Spell Checker!", end='')
        try:
            while True:
                if self.displayMenu() == -1:
                    break  # quit
                self.checkingWords()
                self.generateResults()
                if not self._askYesNo("Return to the main menu? (y/n): "):
                    break  # quit
        finally:
            # store user dictionary even when the session ends abnormally
            # (e.g. Ctrl-C), so words added by the user are not lost
            self._saveDictionary()

    @staticmethod
    def _loadWords(path):
        '''read one word per line from `path` into a set, skipping blank lines'''
        # the context manager closes the file as soon as the set is built
        with open(path, "r") as f:
            return set(line.strip() for line in f if line.strip())

    def _saveDictionary(self):
        '''write the user dictionary to "userDictionary.txt", one word per line'''
        with open("userDictionary.txt", "w") as f:
            # sorted so that the file content is stable between runs
            f.writelines(w + '\n' for w in sorted(self.dictionary))

    @staticmethod
    def _askYesNo(prompt):
        '''repeat `prompt` until the user answers yes/y/no/n; return True for yes'''
        while True:
            answer = input(prompt).strip().lower()
            if answer in ('yes', 'y', 'no', 'n'):
                return answer[0] == 'y'

    @staticmethod
    def _isEnglishLetter(ch):
        '''return True only for the ASCII letters a-z and A-Z'''
        # str.isalpha() is also True for letters of other scripts (e.g. CJK),
        # which would make non-English text count as misspelt English words
        return 'a' <= ch <= 'z' or 'A' <= ch <= 'Z'

    def displayMenu(self):
        '''Main Menu: Work mode selection.

        Stores the text to check in self.texts.
        Returns 0 when a text has been loaded, -1 when the user quits.
        '''
        while True:
            print()
            # display main menu
            self.formatDisplay("1. Spell check a sentence;\n2. Spell check a file;\n0. Quit.", "Work mode selection menu.")
            while True:  # validate user input
                mode = input("What do you want to do: ").strip()
                if mode in ('1', '2', '0'):
                    break
                print("* Invalid input. Please choose again.")
            if mode == '0':  # quit
                return -1
            if mode == '1':  # Spell check a sentence
                self.texts = input("\nPlease enter a sentence: ").strip()
                if self.texts:  # check empty string
                    return 0
                print("* The sentence cannot be empty.")
                continue  # return to main menu
            # mode == '2': Spell check a file
            filename = input("\nPlease enter a filename: ").strip()
            if not filename:  # check empty string
                print("* The file name cannot be empty.")
                continue
            if not os.path.exists(filename):  # filename validation
                print("* Error: The file does not exist.")
                continue
            try:
                with open(filename, 'r') as f:
                    self.texts = f.read()
            except (OSError, ValueError):
                # OSError: unreadable path (directory, permissions, ...);
                # ValueError covers UnicodeDecodeError for non-text files.
                # A bare except would also swallow Ctrl-C.
                print("* Error: unexpected error occurred when reading your file.")
                continue
            return 0

    def checkingWords(self):
        '''word checking process.

        Splits self.texts on whitespace, checks every token and fills the
        counters that generateResults() reports. self.texts becomes a list
        of the original tokens, except where a word is marked or replaced.
        '''
        self.start_time = time.perf_counter()
        self.texts = self.texts.split()  # split original input
        self.original_length = len(self.texts)
        self.total_number = 0  # non-empty English words
        self.correct_number = 0  # words spelt correctly
        self.incorrect_number = 0  # words spelt incorrectly
        self.added_number = 0  # words added to the user dictionary
        self.accepted_number = 0  # words changed by the user accepting the suggested word
        for i, raw in enumerate(self.texts):  # iterate with every word
            word = self.processWord(raw)
            progress = "(" + str(i+1) + "/" + str(self.original_length) + ")"
            if not word:  # skip empty word
                print("\nSkipped non-alpha word \"" + raw + "\". " + progress)
                continue
            self.total_number += 1
            print("\nchecking word: \"" + word + "\". " + progress)
            # words in wordlist or user dictionary will be treated as correct
            if word in self.wordList or word in self.dictionary:
                self.correct_number += 1
                print('OK')
            else:
                # deal with incorrect word
                self.handleIncorrectWord(i, word)
        self.end_time = time.perf_counter()

    def processWord(self, word):
        '''extract pure English word.

        Returns the English letters of `word`, lower-cased, or '' when it
        contains none. For a non-empty result the characters before the
        first letter and after the last letter are saved in
        self.left_sides / self.right_sides so they can be restored later.
        '''
        # positions of all English letters inside the raw token
        letters = [k for k, ch in enumerate(word) if self._isEnglishLetter(ch)]
        if not letters:  # nothing to check
            return ''
        # back up non-letter characters in both sides of the word
        self.left_sides = word[:letters[0]]
        self.right_sides = word[letters[-1]+1:]
        return ''.join(word[k] for k in letters).lower()  # return processed word

    def handleIncorrectWord(self, i, word):
        '''deal with incorrect word.

        i    -- index of the word in self.texts
        word -- the cleaned, lower-cased word that was not found

        Relies on self.left_sides / self.right_sides as left by the
        preceding processWord() call for this same token.
        '''
        print("Encountered an incorrect word.")
        self.formatDisplay("--> " + word + "\n\n1. Ignore;\n2. Mark;\n3. Add to dictionary;\n4. Suggest likely correct spelling.")
        while True:  # validate user input
            handle_method = input("Please choose the way to handle the incorrect word: ").strip()
            if handle_method in ('1', '2', '3', '4'):
                break
            print("* Invalid input. Please choose again.")
        if handle_method == '1':
            # ignore
            self.incorrect_number += 1
            print("\"" + word + "\" has been ignored.")
        elif handle_method == '2':
            # mark
            self.incorrect_number += 1
            self._markWord(i, word)
        elif handle_method == '3':
            # add to dictionary
            self.correct_number += 1
            self.dictionary.add(word)
            self.added_number += 1
            print("\"" + word + "\" has been added into the dictionary and will be treated as correct word in the future.")
        else:
            # give suggestion
            self._handleSuggestion(i, word)

    def _markWord(self, i, word):
        '''mark word with question marks and restore non-alpha characters'''
        marked = '?' + word + '?'
        self.texts[i] = self.left_sides + marked + self.right_sides
        print("\"" + word + "\" has been marked as \"" + marked + "\".")

    def _suggestWord(self, word):
        '''return the known word most similar to `word` ('' if nothing matches)'''
        best_word, best_score = '', 0
        # match with every word in word list and user dictionary
        for source in (self.wordList, self.dictionary):
            for w in source:
                matcher = SequenceMatcher(None, word, w)
                # both quick ratios are cheap upper bounds on ratio(), so a
                # candidate that cannot beat the current best is skipped
                if matcher.real_quick_ratio() <= best_score or matcher.quick_ratio() <= best_score:
                    continue
                score = matcher.ratio()
                if score > best_score:
                    best_word, best_score = w, score
        return best_word

    def _handleSuggestion(self, i, word):
        '''offer the closest known word and apply the user's decision'''
        suggestion = self._suggestWord(word)
        if not suggestion:
            # nothing shares a single character with the word (or the word
            # lists are empty): there is nothing meaningful to accept
            self.incorrect_number += 1
            print("- No suggestion is available.")
            self._markWord(i, word)
            return
        # ask user accept or not
        print("- A possible suggestion is \"" + suggestion + "\".")
        if self._askYesNo("Accept this suggestion? (y/n): "):
            # accept the suggestion
            self.correct_number += 1
            self.accepted_number += 1
            # modify word and restore non-alpha characters
            self.texts[i] = self.left_sides + suggestion + self.right_sides
            print("\"" + word + "\" has been changed to \"" + suggestion + "\".")
        else:
            # reject the suggestion
            self.incorrect_number += 1
            print("The suggestion has been rejected.")
            # if reject, the word will be marked for user to further check
            self._markWord(i, word)

    def generateResults(self):
        '''show the summary statistics and save them with the checked text'''
        # generate summary statistics
        statistics = "\n".join([
            "- Original words number: " + str(self.original_length) + ";",
            "- Total number of English words: " + str(self.total_number) + ";",
            "- Words spelt correctly: " + str(self.correct_number) + ";",
            "- Words spelt incorrectly: " + str(self.incorrect_number) + ";",
            "- Words added to the dictionary: " + str(self.added_number) + ";",
            "- Suggested words accepted: " + str(self.accepted_number) + ";",
            # drop the leading weekday ("Mon ") from the current local time
            "- Spellcheck at: " + time.asctime()[4:] + ";",
            "- The amount of time elapsed: {:.2f}s.".format(self.end_time - self.start_time),
        ])
        print()
        self.formatDisplay(statistics, "Summary Statistics")
        # generate file with summary statistics and checked texts
        while True:
            result_file_name = input("Name of new file with results: ").strip()
            if not result_file_name:  # ensure file name is not empty
                print("* The file name cannot be empty.")
                continue
            try:
                with open(result_file_name, 'w') as f:
                    f.write("Spellcheck Statistics\n")
                    f.write(statistics)
                    f.write('\n\n')
                    f.write(' '.join(self.texts))
                    f.write('\n')
            except (OSError, ValueError):
                # invalid file name, permissions, unencodable text, etc.
                # input() stays outside the try so that EOF / Ctrl-C at the
                # prompt ends the program instead of looping forever
                print("* Error: unexpected error occurred when writing into the file. Please try again.")
                continue
            print("File \"" + result_file_name + "\" has been successfully created.\n")
            break

    def formatDisplay(self, contents, title=None, width=40):
        '''beautify and display contents with borders.

        contents -- text to show, lines separated by "\\n" (left aligned)
        title    -- optional heading, centered above a dividing line
        width    -- minimum inner width of the box; the box grows when a
                    line or the title is longer, so borders stay aligned
        '''
        lines = contents.split('\n')
        width = max([width, len(title or '')] + [len(line) for line in lines])
        # the top border
        print("╔" + "═"*width + "╗")
        # display title area (center)
        if title:
            left_distance = (width-len(title))//2
            right_distance = width - len(title) - left_distance
            print("║" + " "*left_distance + title + " "*right_distance + "║")  # center the title
            print("║" + " " + "─"*(width-2) + " " + "║")  # Dividing line
        # display all contents (left align)
        for line in lines:
            print("║" + line.ljust(width) + "║")
        # the bottom border
        print("╚" + "═"*width + "╝")
if __name__ == "__main__":
    # constructing the checker starts the interactive session
    SpellChecker()