import urllib.request
# Download the Project Gutenberg plain-text file used for the Moby Dick
# word statistics below.
md_url = 'https://www.gutenberg.org/files/2701/2701-0.txt'
# The with-block closes the HTTP response as soon as the body has been read,
# rather than leaving the connection open until the object is collected.
with urllib.request.urlopen(md_url) as md_response:
    md_text = md_response.read().decode()
# position of the first occurrence of the phrase (index raises ValueError
# if the phrase is missing from the download)
idx = md_text.index('Call me Ishmael')
idx
md_text[idx:idx+100]
md_text[idx:idx+100].split()
# lower-case everything and split on whitespace; punctuation stays attached
# to the neighbouring word
md_words = md_text.lower().split()
len(md_words)
# distinct words only
md_words_uniq = set(md_words)
len(md_words_uniq)
# compute the frequency of each word in the text
from collections import Counter

# Counter tallies the words in a single pass. Converting the result back to a
# plain dict keeps the original type (a missing word still raises KeyError)
# and the original key order (order of first appearance in the text).
md_word_counts = dict(Counter(md_words))
md_word_counts['the']
md_word_counts['moby']
len(md_word_counts)
list(md_word_counts.items())[:10]
# (word, count) pairs, most frequent first; sorted() is stable, so words with
# equal counts stay in order of first appearance
md_word_counts_sorted = sorted(md_word_counts.items(), key=lambda t: t[1], reverse=True)
md_word_counts_sorted[:10]
import urllib.request
# Download a whitespace-separated list of stopwords.
sw_url = 'https://moss.cs.iit.edu/stopwords.txt'
# The with-block closes the HTTP response once the body has been read.
with urllib.request.urlopen(sw_url) as sw_response:
    sw_text = sw_response.read().decode()
stopwords = sw_text.split()
# Drop the stopwords from the frequency table. Filtering the existing dict
# (instead of rebuilding it from a set difference) keeps the words in their
# original order, so the stable sort below breaks ties the same way on every
# run; the set is built once so each membership test is O(1).
stopword_set = set(stopwords)
md_word_counts = {word: count
                  for word, count in md_word_counts.items()
                  if word not in stopword_set}
# (word, count) pairs, most frequent first
md_word_counts_sorted = sorted(md_word_counts.items(), key=lambda t: t[1], reverse=True)
md_word_counts_sorted[:10]
%matplotlib inline
import matplotlib.pyplot as plt
# Bar chart of the n most frequent words: split the leading (word, count)
# pairs into two parallel lists, one for the tick labels and one for the
# bar heights.
n = 10
top_pairs = md_word_counts_sorted[:n]
words = [pair[0] for pair in top_pairs]
counts = [pair[1] for pair in top_pairs]
# figure size is set through rcParams, so it also applies to later figures
plt.rcParams['figure.figsize'] = [12, 5]
# one bar per rank 0..n-1, labelled with the corresponding word
plt.bar(range(n), counts)
plt.xticks(range(n), words, rotation=60, fontsize=12)
plt.show()
# collect all two-word phrases as tuples: pairing the word list with itself
# shifted by one yields every (word, following word) pair in text order
phrases = list(zip(md_words, md_words[1:]))
phrases[:10]
# map each word to a list of all the words that
# follow it in the text (repeats are kept, so a frequent follower appears
# in the list proportionally often)
phrase_dict = {}
for w1, w2 in phrases:
    # setdefault creates the list the first time w1 is seen; phrase_dict
    # stays a plain dict, so looking up an unknown word still raises KeyError
    phrase_dict.setdefault(w1, []).append(w2)
phrase_dict['starboard']
# generate a sentence based on two-word phrase statistics
# from Moby Dick
import random

# start from a seed word and repeatedly pick a random recorded follower of
# the most recent word; followers are stored with repetition, so common
# continuations are picked more often
gen_words = ['whale']
for _ in range(10):
    next_words = phrase_dict.get(gen_words[-1])
    if not next_words:
        # The current word has no recorded follower (for example it only
        # occurs as the very last word of the text): stop here instead of
        # failing with a KeyError.
        break
    gen_words.append(random.choice(next_words))
' '.join(gen_words)
# Sample puzzle: nine lines of nine cells, '.' marking an empty cell; the
# spaces only separate the 3x3 boxes visually.
puzzle = '''..3 .2. 6..
9.. 3.5 ..1
..1 8.6 4..
..8 1.2 9..
7.. ... ..8
..6 7.8 2..
..2 6.9 5..
8.. 2.3 ..9
..5 .1. 3..
'''
# Every square gets a two-character name: a row letter followed by a column
# digit, listed row by row ('A1', 'A2', ..., 'I9').
rows = 'ABCDEFGHI'
cols = '123456789'
squares = [row + col for row in rows for col in cols]
def parse_puzzle(puz_str):
    '''Parse a textual puzzle into a dict mapping each square name to
    its given digit.

    Whitespace of any kind (spaces, newlines, tabs, carriage returns) is
    layout only and is skipped. Every remaining character is one cell,
    in the same order as `squares`: a character '1'-'9' is a given
    digit, anything else (such as '.') is an empty cell, stored as None.

    Raises ValueError if the number of cells differs from the number of
    squares, instead of failing with a bare IndexError on short input or
    silently ignoring the surplus on long input.
    '''
    cells = [c if c in '123456789' else None
             for c in puz_str if not c.isspace()]
    if len(cells) != len(squares):
        raise ValueError('expected {} cells in puzzle, found {}'.format(
            len(squares), len(cells)))
    # the lengths match, so zip pairs every square with exactly one cell
    return dict(zip(squares, cells))
parse_puzzle(puzzle)
# what squares fall in the same columns? (list of lists)
# one unit per column digit: the digit is fixed while the row letter varies
vert_units = [[r+c for r in rows] for c in cols]
vert_units
# what squares fall in the same rows? (list of lists)
# one unit per row letter: the letter is fixed while the column digit varies
horiz_units = [[r+c for c in cols] for r in rows]
horiz_units
# what squares fall in the same "boxes"? (list of lists)
# one unit per 3x3 box: three consecutive rows crossed with three
# consecutive columns
box_units = [[r+c for r in rs for c in cs]
             for rs in ('ABC', 'DEF', 'GHI')
             for cs in ('123', '456', '789')]
box_units
# every unit: rows first, then columns, then boxes
all_units = horiz_units + vert_units + box_units
# For every square, the list of units (taken from all_units, in that order)
# that contain it.
units = {square: [unit for unit in all_units if square in unit]
         for square in squares}
units['A1']
# For every square, its "peers": the union of all squares in its units,
# with the square itself removed.
peers = {square: set().union(*units[square]).difference({square})
         for square in squares}
peers['A1']
# The "catch-all" solution: every square may still hold any digit.
sol = dict.fromkeys(squares, '123456789')
sol
def assign(sol, sq, val):
    '''Fix square `sq` to `val` by eliminating every other candidate
    that `sol` currently lists for that square.

    `sol` is modified in place; nothing is returned.'''
    # Take the snapshot of the unwanted candidates up front: eliminate()
    # rewrites sol[sq] while this loop is running.
    unwanted = sol[sq].replace(val, '')
    for digit in unwanted:
        eliminate(sol, sq, digit)
def eliminate(sol, sq, val):
    '''Remove candidate `val` from square `sq` and propagate the
    consequences through `sol` (modified in place, nothing returned).

    Two follow-up rules are applied after the removal:
    - if `sq` is left with a single candidate, that digit is eliminated
      from all peers of `sq`;
    - if, within any unit of `sq`, only one square can still hold
      `val`, `val` is assigned to that square.'''
    before = sol[sq]
    if val not in before:
        # already eliminated earlier; this also ends the recursion
        return
    after = before.replace(val, '')
    sol[sq] = after
    if len(after) == 1:
        # sq is settled, so none of its peers may keep that digit
        last = after[0]
        for peer in peers[sq]:
            eliminate(sol, peer, last)
    for unit in units[sq]:
        # squares of this unit that still list val as a candidate
        holders = [member for member in unit if val in sol[member]]
        if len(holders) == 1:
            assign(sol, holders[0], val)
def solve_puzzle(puzz_str):
    '''Parse `puzz_str` and return a dict mapping every square to the
    string of digits still possible for it after all given digits have
    been assigned.'''
    givens = parse_puzzle(puzz_str)
    # start with every digit possible in every square
    grid = dict.fromkeys(squares, '123456789')
    for square, digit in givens.items():
        if not digit:
            # empty cell in the puzzle: nothing to assign
            continue
        assign(grid, square, digit)
    return grid
solve_puzzle(puzzle)
def print_sol(sol):
    '''Print `sol` as a grid: one text line per row, with the candidate
    string of each square centred in its own column.

    Columns are 6 characters wide, and grow when some square still lists
    more than 6 candidates so that the grid stays aligned.'''
    # widest candidate string, but never narrower than the usual 6
    width = max([6] + [len(sol[r+c]) for r in rows for c in cols])
    for r in rows:
        # every cell is followed by one space, including the last in the row
        line = ''.join('{:^{w}} '.format(sol[r+c], w=width) for c in cols)
        print(line)
print_sol(solve_puzzle(puzzle))
# NYTimes "Medium" puzzle: solve it and print the resulting grid. The
# leading newline inside the literal is harmless because the parser
# skips newlines.
print_sol(
    solve_puzzle(
        '''
.51 8.. 3.6
.3. ... ...
..9 .42 .15
..4 .75 ...
3.. ... ...
.8. 9.. ...
... ... 8..
.1. ..6 .9.
..7 ... ..4
'''
    )
)