/
Programming For Big Data Programming For Big Data

Programming For Big Data - PowerPoint Presentation

tatiana-dople
tatiana-dople . @tatiana-dople
Follow
390 views
Uploaded On 2017-08-18

Programming For Big Data - PPT Presentation

Darren Redmond Developing Out Spell Checking Capabilities Based on the tutorial below and well see where we end up in class http wwwopenbookprojectnetpy4funspellCheck Create a file called ID: 579989

check words spellchecker word words check word spellchecker failed sentence assertequals def zygotic spell file len mistasdas return caret

Share:

Link:

Embed:

Download Presentation from below link

Download Presentation The PPT/PDF document "Programming For Big Data" is the property of its rightful owner. Permission is granted to download and print the materials on this web site for personal, non-commercial use only, and to display it on your personal computer provided you do not modify the materials and that you retain all copyright notices contained in the materials. By downloading content from our website, you accept the terms of this agreement.


Presentation Transcript

Slide1

Programming For Big Data

Darren RedmondSlide2

Developing Out Spell Checking Capabilities

Based on the tutorial below – and we’ll see where we end up in class:

http://www.openbookproject.net/py4fun/spellCheck/

Create a file called

spellcheck.py

with the following:

# to begin we start with a list of words in a file called

spell.words

# we read the file and strip out the line endings

words

= open

('

spell.words

'

).

readlines

()

words = map(lambda

x:

x.strip

(),

words)

# now check if the word zygotic is a word

p

rint('zygotic

' in

words)

p

ython

spellcheck.pySlide3

Create Functions For Reuse

#

create a function to load the words and to check a word is in the

dictionary

def load_words(file_name):
    """Load the dictionary word list from a text file.

    Args:
        file_name: Path to a file containing one word per line.

    Returns:
        A list with one entry per line of the file, each stripped of
        surrounding whitespace (including the line ending).
    """
    # The with-block closes the file even if reading fails part-way.
    with open(file_name) as word_file:
        # Build a real list rather than a map() object: on Python 3 map() is
        # a one-shot iterator, so a second "word in words" check would find
        # it already consumed.
        return [line.strip() for line in word_file]

def

check_word

(words, word

):

return word in words

w

ords =

load_words

('

spell.words

')

# now check if the word zygotic is a word

print(

check_word

(words, 'zygotic'))

p

ython

spellcheck.pySlide4

Create

Function To Check A Sentence

def

check_words

(words, sentence):

words_to_check

=

sentence.split

(' ')

for word in

words_to_check

:

if not

check_word

(words, word):

print(

'

Word is misspelt :

'

+ word)

return False

return True

words =

load_words

('

spell.words

')

p

rint(

check_word

(words, 'zygotic'))

p

rint(

check_words

(words, 'zygotic

mistasdas

elementary'))Slide5

The Full Program Now

def load_words(file_name):
    """Load the dictionary word list from a text file.

    Args:
        file_name: Path to a file containing one word per line.

    Returns:
        A list with one entry per line of the file, each stripped of
        surrounding whitespace (including the line ending).
    """
    # The with-block closes the file even if reading fails part-way.
    with open(file_name) as word_file:
        # A list (not a lazy map() object) can be searched repeatedly, which
        # check_word and check_words rely on when called more than once.
        return [line.strip() for line in word_file]

def

check_word

(words, word

):

return word in

words

d

ef

check_words

(words, sentence):

words_to_check

=

sentence.split

(

'

')

for word in

words_to_check

:

if not

check_word

(words, word):

print(

'

Word is misspelt :

'

+ word)

return False

return True

w

ords =

load_words

('

spell.words

')

p

rint(

check_word

(words, 'zygotic'))

p

rint(

check_words

(words, 'zygotic

mistasdas

elementary'))Slide6

Create Spell Checker Class

class

SpellChecker

(object

):

def

__

init

__(self):

self.words

=

[]

def load_words(self, file_name):
    """Read the dictionary file and store its words on the instance.

    Args:
        file_name: Path to a file containing one word per line.

    After the call ``self.words`` is a list with one entry per line of the
    file, each stripped of surrounding whitespace.
    """
    # The with-block closes the file even if reading fails part-way.
    with open(file_name) as word_file:
        # Store a real list: a Python 3 map() object would be exhausted by
        # the first membership test made in check_word.
        self.words = [line.strip() for line in word_file]

def

check_word

(self, word

):

return

word in

self.words

def

check_words

(self, sentence):

words_to_check

=

sentence.split

(

'

')

for word in

words_to_check

:

if not

self.check_word

(word):

print(

'

Word is misspelt :

'

+ word)

return False

return TrueSlide7

Use the Spell Checker Class

# guard so that this is only called when the script is run from the command line

if __name__ == '__main__':

spellChecker

= SpellChecker()

spellChecker.load_words

('spell.words')

print(spellChecker.check_word

('zygotic'))

print(spellChecker.check_words

('zygotic mistasdas elementary'))Slide8

Let’s Create Some Unit Tests – spellcheck_test.py

import

unittest

f

rom spellcheck import SpellChecker

class TestSpellChecker(unittest.TestCase):

def

setUp(self

):

self.spellChecker

=

SpellChecker()

self.spellChecker.load_words('spell.words

'

)

def test_spell_checker(self):

self.assertTrue(self.spellChecker.check_word('zygotic

'))

self.assertFalse(self.spellChecker.check_words('zygotic

mistasdas elementary'))

self.assertTrue(self.spellChecker.check_words('our first correct sentence'))

if

__name__ == '__main__':

unittest.main

()Slide9

Let’s Find Some Bugs

import unittest

f

rom spellcheck

import

SpellChecker

class TestSpellChecker(unittest.TestCase):

def

setUp(self

):

self.spellChecker

=

SpellChecker()

self.spellChecker.load_words('spell.words

'

)

def test_spell_checker(self):

self.assertTrue(self.spellChecker.check_word('zygotic

'))

self.assertFalse(self.spellChecker.check_words('zygotic

mistasdas elementary'))

self.assertTrue(self.spellChecker.check_words('our first correct sentence'))

# handle case sensitivity

self.assertTrue(self.spellChecker.check_words(

'

Our

first correct sentence'))

#

handle

full stop

self.assertTrue(self.spellChecker.check_words('Our first

correct

sentence.'))

if

__name__ == '__main__':

unittest.main

()Slide10

Let’s Fix Some Bugs

class SpellChecker(object):

def check_word(self, word):
    """Return True when ``word`` is found in ``self.words``.

    The word is normalised before the lookup: leading and trailing full
    stops are removed and the result is lower-cased.

    Args:
        word: A single word, possibly capitalised or followed by a full stop.
    """
    # remove full stops and ensure lower case - 2 bugs fixed.
    return word.strip('.').lower() in self.words

…Slide11

Handle Multiple Words Failing in a Sentence

import unittest

f

rom spellcheck import SpellChecker

class TestSpellChecker(unittest.TestCase):

def

setUp(self

):

self.spellChecker

=

SpellChecker()

self.spellChecker.load_words('spell.words

'

)

def test_spell_checker(self):

self.assertTrue(self.spellChecker.check_word('zygotic

'))

self.assertFalse(self.spellChecker.check_words('zygotic

mistasdas

elementary

'))

self.assertTrue(self.spellChecker.check_words('our first correct sentence'))

# handle case sensitivity

self.assertTrue(self.spellChecker.check_words(

'

Our

first correct sentence'))

#

handle

full stop

self.assertTrue(self.spellChecker.check_words('Our first

correct

sentence.'))

self.assertFalse(self.spellChecker.check_words

('zygotic

mistasdas

spelllleeeing elementary'))

…Slide12

Handle Multiple Failures in a Sentence Better

class

SpellChecker

(object

):

def check_words(self, sentence):
    """Spell check every word of a sentence.

    Each failing word is printed as it is found.

    Args:
        sentence: Text whose words are separated by single spaces.

    Returns:
        A list of the words rejected by ``self.check_word``, in the order
        they appear in the sentence; empty when every word passes.
    """
    failed_words = []
    for word in sentence.split(' '):
        if not self.check_word(word):
            print('Word is misspelt : ' + word)
            # Keep going instead of returning at the first failure so that
            # every misspelt word in the sentence is reported.
            failed_words.append(word)
    return failed_words

…Slide13

Handle Multiple Failures in a Sentence Better

class

TestSpellChecker(unittest.TestCase):

def

test_spell_checker(self):

self.assertTrue(self.spellChecker.check_word('zygotic'))

failed_words = self.spellChecker.check_words('zygotic mistasdas elementary')

self.assertEquals(1, len(failed_words))

self.assertEquals('mistasdas', failed_words[0])

self.assertEquals(0, len(self.spellChecker.check_words('our first correct sentence')))

# handle case sensitivity

self.assertEquals(0, len(self.spellChecker.check_words('Our first correct sentence')))

# handle full stop

self.assertEquals(0, len(self.spellChecker.check_words('Our first correct sentence.')))

failed_words = self.spellChecker.check_words('zygotic mistasdas spelllleeeing elementary')

self.assertEquals(2, len(failed_words))

self.assertEquals('mistasdas', failed_words[0])

self.assertEquals('spelllleeeing',

failed_words[1

])Slide14

Handle a Document which is a list of Sentences

class

SpellChecker

(object

):

def load_file(self, file_name):
    """Read a text file and return its lines as a list.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one entry per line, each stripped of surrounding
        whitespace (including the line ending).
    """
    # The with-block closes the file even if reading fails part-way.
    with open(file_name) as source_file:
        # Return a list, not a lazy map() object: the result is stored as
        # self.words (searched repeatedly) and as self.sentences (iterated).
        return [line.strip() for line in source_file]

def load_words(self, file_name):

self.words = self.load_file(file_name)

def check_document(self, file_name):
    """Spell check every line of a file.

    The lines are loaded with ``self.load_file`` and kept on
    ``self.sentences``; each one is passed to ``self.check_words`` together
    with its zero-based position in the file.

    Args:
        file_name: Path of the document to check.

    Returns:
        One flat list holding the failures reported for all lines, in
        document order.
    """
    self.sentences = self.load_file(file_name)
    failed_words_in_sentences = []
    # enumerate supplies the line index, replacing a hand-maintained counter.
    for index, sentence in enumerate(self.sentences):
        failed_words_in_sentences.extend(self.check_words(sentence, index))
    return failed_words_in_sentences

…Slide15

Handle a Document which is a list of Sentences

class TestSpellChecker(unittest.TestCase):

def test_spell_checker(self):

self.assertTrue(self.spellChecker.check_word('zygotic'))

failed_words = self.spellChecker.check_words('zygotic mistasdas elementary')

self.assertEquals(1, len(failed_words))

self.assertEquals('mistasdas', failed_words[0])

self.assertEquals(0, len(self.spellChecker.check_words('our first correct sentence')))

# handle case sensitivity

self.assertEquals(0, len(self.spellChecker.check_words('Our first correct sentence')))

# handle full stop

self.assertEquals(0, len(self.spellChecker.check_words('Our first correct sentence.')))

failed_words = self.spellChecker.check_words('zygotic mistasdas spelllleeeing elementary')

self.assertEquals(2, len(failed_words))

self.assertEquals('mistasdas', failed_words[0])

self.assertEquals('spelllleeeing',

failed_words[1

])

#

more bugs: the spell checker fails when checking its own dictionary – 21 entries are reported as misspelt because the dictionary words need to be lower-cased

self.assertEqual(21, len(self.spellChecker.check_document('spell.words')))Slide16

Handle lowering the case of the dictionary words

class

SpellChecker

(object

):

def load_file(self, file_name):
    """Read a text file and return its lines, lower-cased, as a list.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one entry per line, each stripped of surrounding
        whitespace and converted to lower case.
    """
    # The with-block closes the file even if reading fails part-way.
    with open(file_name) as source_file:
        # ensures that all items read become lower case.
        return [line.strip().lower() for line in source_file]

class TestSpellChecker(unittest.TestCase):

def

test_spell_checker(self

):

# fix the 21 issues of words not matching the dictionary

self.assertEqual(0,

len(self.spellChecker.check_document('spell.words')))Slide17

Tracking failed words, line number and caret position

In order to report the misspelt word, its line number, and its caret position, we will append a dict to the list of failed words instead of just the word.

So:

failed_words.append(word)

Will become:

failed_words.append(

{'word

'

:word, 'line

'

: line_number, 'pos

'

: caret_position}

)

We will just need to keep track of the lines and the caret positions.

So in the check_document function we should use the enumerate function to give us the index of each sentence in the list:

f

or sentence

in

self.sentences:

failed_words_in_sentences.extend(self.check_words(sentence))

Becomes

for

index,

sentence in

enumerate(

self.sentences

):

failed_words_in_sentences.extend(self.check_words(sentence,

index

))Slide18

Tracking failed words, line number and caret position

So the check_words function will become:

def

check_words(self, sentence

, index

):

words_to_check

= sentence.split(' ')

caret_position = 0

failed_words = []

for word in words_to_check:

if not self.check_word(word):

print('Word is misspelt ' + word + ' at line : ' + str(index+1) + ' pos ' + str(caret_position+1))

failed_words.append(

{'word':word,'line':index+1,'pos':

caret_position+1

}

)

# update the caret position to be the length of the word plus 1 for the split character.

caret_position = caret_position + len(word) + 1

return

failed_wordsSlide19

Tracking failed words, line number and caret position

So the check_words function will become:

#

index = 0 is set here so that the function can be called for one line and index defaults to 0

def check_words(self, sentence

, index = 0

):

words_to_check

= sentence.split(' ')

caret_position = 0

failed_words = []

for word in words_to_check:

if not self.check_word(word):

print('Word is misspelt ' + word + ' at line : ' + str(index+1) + ' pos ' + str(caret_position+1))

failed_words.append(

{'word':word,'line':index+1,'pos':

caret_position+1

}

)

# update the caret position to be the length of the word plus 1 for the split character.

caret_position = caret_position + len(word) + 1

return

failed_wordsSlide20

Updating the tests

So the test_spell_checker test will become:

def test_spell_checker(self):

self.assertTrue(self.spellChecker.check_word('zygotic'))

failed_words = self.spellChecker.check_words('zygotic mistasdas elementary')

self.assertEquals(1, len(failed_words))

self.assertEquals('mistasdas', failed_words[0]['word'])

self.assertEquals(1, failed_words[0]['line'])

self.assertEquals(9, failed_words[0]['pos'])

self.assertEquals(0, len(self.spellChecker.check_words('our first correct sentence')))

# handle case sensitivity

self.assertEquals(0, len(self.spellChecker.check_words('Our capital sentence')))

# handle full stop

self.assertEquals(0, len(self.spellChecker.check_words('Our full stop sentence.')))

failed_words = self.spellChecker.check_words('zygotic mistasdas spelllleeeing elementary')

self.assertEquals(2, len(failed_words))

self.assertEquals('mistasdas', failed_words[0]['word'])

self.assertEquals(1, failed_words[0]['line'])

self.assertEquals(9, failed_words[0]['pos'])

self.assertEquals('spelllleeeing', failed_words[1]['word'])

self.assertEquals(1, failed_words[1]['line'])

self.assertEquals(19, failed_words[1]['pos'])

self.assertEqual(0, len(self.spellChecker.check_document('spell.words')))Slide21

The Full Solution

Look at the files spellcheck.py and spellcheck_test.py

What next?

How to spell check a directory of files? – hint: os.listdir

How to handle different languages? – hint: a dict with one word list per language.