Darren Redmond – Developing Out Spell Checking Capabilities. Based on the tutorial below – and we'll see where we end up in class: http://www.openbookproject.net/py4fun/spellCheck/ Create a file called spellcheck.py … ID: 579989
Download Presentation The PPT/PDF document "Programming For Big Data" is the property of its rightful owner. Permission is granted to download and print the materials on this web site for personal, non-commercial use only, and to display it on your personal computer provided you do not modify the materials and that you retain all copyright notices contained in the materials. By downloading content from our website, you accept the terms of this agreement.
Slide1
Programming For Big Data
Darren RedmondSlide2
Developing Out Spell Checking Capabilities
Based on the tutorial below – and we’ll see where we end up in class:
http://www.openbookproject.net/py4fun/spellCheck/
Create a file called spellcheck.py with the following:
# to begin, we start with a list of words in a file called spell.words
# we read the file and strip out the line endings
words
= open
('
spell.words
'
).
readlines
()
words = map(lambda
x:
x.strip
(),
words)
# now check if the word zygotic is a word
p
rint('zygotic
' in
words)
p
ython
spellcheck.pySlide3
Create Functions For Reuse
# create a function to load the words and a function to check that a word is in the dictionary
def
load_words
(
file_name
):
words = open(
file_name
).
readlines
()
words = map(lambda x:
x.strip
(), words)
return words
def
check_word
(words, word
):
return word in words
w
ords =
load_words
('
spell.words
')
# now check if the word zygotic is a word
print(
check_word
(words, 'zygotic'))
p
ython
spellcheck.pySlide4
Create
Function To Check A Sentence
…
def
check_words
(words, sentence):
words_to_check
=
sentence.split
(' ')
for word in
words_to_check
:
if not
check_word
(words, word):
print(
'
Word is misspelt :
'
+ word)
return False
return True
words =
load_words
('
spell.words
')
p
rint(
check_word
(words, 'zygotic'))
p
rint(
check_words
(words, 'zygotic
mistasdas
elementary'))Slide5
The Full Program Now
def
load_words
(
file_name
):
words = open(
file_name
).
readlines
()
words = map(lambda x:
x.strip
(), words
)
return words
def
check_word
(words, word
):
return word in
words
d
ef
check_words
(words, sentence):
words_to_check
=
sentence.split
(
'
')
for word in
words_to_check
:
if not
check_word
(words, word):
print(
'
Word is misspelt :
'
+ word)
return False
return True
w
ords =
load_words
('
spell.words
')
p
rint(
check_word
(words, 'zygotic'))
p
rint(
check_words
(words, 'zygotic
mistasdas
elementary'))Slide6
Create Spell Checker Class
class
SpellChecker
(object
):
def
__
init
__(self):
self.words
=
[]
def
load_words(self, file_name
):
self.words
= open(
file_name
).
readlines
()
self.words
= map(lambda x:
x.strip
(),
self.words
)
def
check_word
(self, word
):
return
word in
self.words
def
check_words
(self, sentence):
words_to_check
=
sentence.split
(
'
')
for word in
words_to_check
:
if not
self.check_word
(word):
print(
'
Word is misspelt :
'
+ word)
return False
return TrueSlide7
Use the Spell Checker Class
…
# guard so that this is only executed when the script is run from the command line
if __name__ == '__main__':
spellChecker
= SpellChecker()
spellChecker.load_words
('spell.words')
print(spellChecker.check_word
('zygotic'))
print(spellChecker.check_words
('zygotic mistasdas elementary'))Slide8
Let’s Create Some Unit Tests – spellcheck_test.py
import
unittest
f
rom spellcheck import SpellChecker
class TestSpellChecker(unittest.TestCase):
def
setUp(self
):
self.spellChecker
=
SpellChecker()
self.spellChecker.load_words('spell.words
'
)
def test_spell_checker(self):
self.assertTrue(self.spellChecker.check_word('zygotic
'))
self.assertFalse(self.spellChecker.check_words('zygotic
mistasdas elementary'))
self.assertTrue(self.spellChecker.check_words('our first correct sentence'))
if
__name__ == '__main__':
unittest.main
()Slide9
Let’s Find Some Bugs
import unittest
f
rom spellcheck
import
SpellChecker
class TestSpellChecker(unittest.TestCase):
def
setUp(self
):
self.spellChecker
=
SpellChecker()
self.spellChecker.load_words('spell.words
'
)
def test_spell_checker(self):
self.assertTrue(self.spellChecker.check_word('zygotic
'))
self.assertFalse(self.spellChecker.check_words('zygotic
mistasdas elementary'))
self.assertTrue(self.spellChecker.check_words('our first correct sentence'))
# handle case sensitivity
self.assertTrue(self.spellChecker.check_words(
'
Our
first correct sentence'))
# handle full stop
self.assertTrue(self.spellChecker.check_words('Our first
correct
sentence.'))
if
__name__ == '__main__':
unittest.main
()Slide10
Let’s Fix Some Bugs
class SpellChecker(object):
…
def
check_word(self,
word
):
# remove full stops and ensure lower case – 2 bugs fixed.
return
word
.strip(
'
.').lower()
in self.words
…Slide11
Handle Multiple Words Failing in a Sentence
import unittest
f
rom spellcheck import SpellChecker
class TestSpellChecker(unittest.TestCase):
def
setUp(self
):
self.spellChecker
=
SpellChecker()
self.spellChecker.load_words('spell.words
'
)
def test_spell_checker(self):
self.assertTrue(self.spellChecker.check_word('zygotic
'))
self.assertFalse(self.spellChecker.check_words('zygotic
mistasdas
elementary
'))
self.assertTrue(self.spellChecker.check_words('our first correct sentence'))
# handle case sensitivity
self.assertTrue(self.spellChecker.check_words(
'
Our
first correct sentence'))
# handle full stop
self.assertTrue(self.spellChecker.check_words('Our first
correct
sentence.'))
self.assertFalse(self.spellChecker.check_words
('zygotic
mistasdas
spelllleeeing elementary'))
…Slide12
Handle Multiple Failures in a Sentence Better
class
SpellChecker
(object
):
…
def
check_words
(self, sentence):
words_to_check
=
sentence.split
(
'
')
failed_words = []
for word in
words_to_check
:
if not
self.check_word
(word):
print(
'
Word is misspelt :
'
+ word)
failed_words.append(word)
return failed_words
…Slide13
Handle Multiple Failures in a Sentence Better
class
TestSpellChecker(unittest.TestCase):
…
def
test_spell_checker(self):
self.assertTrue(self.spellChecker.check_word('zygotic'))
failed_words = self.spellChecker.check_words('zygotic mistasdas elementary')
self.assertEquals(1, len(failed_words))
self.assertEquals('mistasdas', failed_words[0])
self.assertEquals(0, len(self.spellChecker.check_words('our first correct sentence')))
# handle case sensitivity
self.assertEquals(0, len(self.spellChecker.check_words('Our first correct sentence')))
# handle full stop
self.assertEquals(0, len(self.spellChecker.check_words('Our first correct sentence.')))
failed_words = self.spellChecker.check_words('zygotic mistasdas spelllleeeing elementary')
self.assertEquals(2, len(failed_words))
self.assertEquals('mistasdas', failed_words[0])
self.assertEquals('spelllleeeing',
failed_words[1
])Slide14
Handle a Document which is a list of Sentences
class
SpellChecker
(object
):
def
load_file(self, file_name):
lines = open(file_name).readlines()
return map(lambda x: x.strip(), lines)
def load_words(self, file_name):
self.words = self.load_file(file_name)
…
def
check_document(self, file_name):
self.sentences = self.load_file(file_name)
failed_words_in_sentences = []
index = 0
for sentence in self.sentences:
failed_words_in_sentences.extend(self.check_words(sentence, index))
index = index + 1
return
failed_words_in_sentences
…Slide15
Handle a Document which is a list of Sentences
class TestSpellChecker(unittest.TestCase):
…
def test_spell_checker(self):
self.assertTrue(self.spellChecker.check_word('zygotic'))
failed_words = self.spellChecker.check_words('zygotic mistasdas elementary')
self.assertEquals(1, len(failed_words))
self.assertEquals('mistasdas', failed_words[0])
self.assertEquals(0, len(self.spellChecker.check_words('our first correct sentence')))
# handle case sensitivity
self.assertEquals(0, len(self.spellChecker.check_words('Our first correct sentence')))
# handle full stop
self.assertEquals(0, len(self.spellChecker.check_words('Our first correct sentence.')))
failed_words = self.spellChecker.check_words('zygotic mistasdas spelllleeeing elementary')
self.assertEquals(2, len(failed_words))
self.assertEquals('mistasdas', failed_words[0])
self.assertEquals('spelllleeeing',
failed_words[1
])
# more bugs: the spell checker doesn’t spell-check its own dictionary correctly – 21 entries fail because the dictionary words need to be lower-cased
self.assertEqual(21, len(self.spellChecker.check_document('spell.words')))Slide16
Handle lowering the case of the dictionary words
class
SpellChecker
(object
):
def
load_file(self, file_name):
lines = open(file_name).
readlines
()
# ensures that all items read become lower case.
return map(lambda x:
x.strip
()
.lower()
,
lines
)
…
class TestSpellChecker(unittest.TestCase):
…
def
test_spell_checker(self
):
…
# fix the 21 issues of words not matching the dictionary
self.assertEqual(0,
len(self.spellChecker.check_document('spell.words')))Slide17
Tracking failed words, line number and caret position
In order to report the misspelt word, its line number, and its caret position, we will append a dict to the list of failed words instead of just the word.
So:
failed_words.append(word)
Will become:
failed_words.append(
{'word
'
:word, 'line
'
: line_number, 'pos
'
: caret_position}
)
We will just need to keep track of the line numbers and the caret positions.
So in the check_document function, we should use the enumerate function to give us the index of each sentence in the list:
f
or sentence
in
self.sentences:
failed_words_in_sentences.extend(self.check_words(sentence))
Becomes
for
index,
sentence in
enumerate(
self.sentences
):
failed_words_in_sentences.extend(self.check_words(sentence,
index
))Slide18
Tracking failed words, line number and caret position
So the check_words function will become:
def
check_words(self, sentence
, index
):
words_to_check
= sentence.split(' ')
caret_position = 0
failed_words = []
for word in words_to_check:
if not self.check_word(word):
print('Word is misspelt ' + word + ' at line : ' + str(index+1) + ' pos ' + str(caret_position+1))
failed_words.append(
{'word':word,'line':index+1,'pos':
caret_position+1
}
)
# update the caret position to be the length of the word plus 1 for the split character.
caret_position = caret_position + len(word) + 1
return
failed_wordsSlide19
Tracking failed words, line number and caret position
So the check_words function will become:
# index defaults to 0 so that the function can be called for a single line without passing an index
def check_words(self, sentence
, index = 0
):
words_to_check
= sentence.split(' ')
caret_position = 0
failed_words = []
for word in words_to_check:
if not self.check_word(word):
print('Word is misspelt ' + word + ' at line : ' + str(index+1) + ' pos ' + str(caret_position+1))
failed_words.append(
{'word':word,'line':index+1,'pos':
caret_position+1
}
)
# update the caret position to be the length of the word plus 1 for the split character.
caret_position = caret_position + len(word) + 1
return
failed_wordsSlide20
Updating the tests
So the test_spell_checker function will become:
def test_spell_checker(self):
self.assertTrue(self.spellChecker.check_word('zygotic'))
failed_words = self.spellChecker.check_words('zygotic mistasdas elementary')
self.assertEquals(1, len(failed_words))
self.assertEquals('mistasdas', failed_words[0]['word'])
self.assertEquals(1, failed_words[0]['line'])
self.assertEquals(9, failed_words[0]['pos'])
self.assertEquals(0, len(self.spellChecker.check_words('our first correct sentence')))
# handle case sensitivity
self.assertEquals(0, len(self.spellChecker.check_words('Our capital sentence')))
# handle full stop
self.assertEquals(0, len(self.spellChecker.check_words('Our full stop sentence.')))
failed_words = self.spellChecker.check_words('zygotic mistasdas spelllleeeing elementary')
self.assertEquals(2, len(failed_words))
self.assertEquals('mistasdas', failed_words[0]['word'])
self.assertEquals(1, failed_words[0]['line'])
self.assertEquals(9, failed_words[0]['pos'])
self.assertEquals('spelllleeeing', failed_words[1]['word'])
self.assertEquals(1, failed_words[1]['line'])
self.assertEquals(19, failed_words[1]['pos'])
self.assertEqual(0, len(self.spellChecker.check_document('spell.words')))Slide21
The Full Solution
Look at the files spellcheck.py and spellcheck_test.py
What next?
How to spell check a directory of files? – hint: os.listdir
How to handle different languages? – hint: a dict with one word list per language.