# by default, only the result of the last expression in a cell is displayed after evaluation.
# the following forces display of *all* self-standing expressions in a cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Between the array-backed and linked list we have:
dict
¶import timeit
def lin_search(lst, x): #O(n)
for i in range(len(lst)):
if lst[i] == x:
return i
raise ValueError(x)
def bin_search(lst, x): #O(lg N)
# assume that lst is sorted!!!
low = 0
hi = len(lst)
mid = (low + hi) // 2
while lst[mid] != x and low <= hi:
if lst[mid] < x:
low = mid + 1
else:
hi = mid - 1
mid = (low + hi) // 2
if lst[mid] == x:
return mid
else:
raise ValueError(x)
def time_lin_search(size):
return timeit.timeit('lin_search(lst, random.randrange({}))'.format(size), # interpolate size into randrange
'import random ; from __main__ import lin_search ;'
'lst = [x for x in range({})]'.format(size), # interpolate size into list range
number=100)
def time_bin_search(size):
return timeit.timeit('bin_search(lst, random.randrange({}))'.format(size), # interpolate size into randrange
'import random ; from __main__ import bin_search ;'
'lst = [x for x in range({})]'.format(size), # interpolate size into list range
number=100)
def time_dict(size):
return timeit.timeit('dct[random.randrange({})]'.format(size),
'import random ; '
'dct = {{x: x for x in range({})}}'.format(size),
number=100)
lin_search_timings = [time_lin_search(n)
for n in range(10, 10000, 100)]
bin_search_timings = [time_bin_search(n)
for n in range(10, 10000, 100)]
dict_timings = [time_dict(n)
for n in range(10, 10000, 100)]
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(lin_search_timings, 'ro')
plt.plot(bin_search_timings, 'gs')
plt.plot(dict_timings, 'b^')
plt.show()
[<matplotlib.lines.Line2D at 0x142a2894970>]
[<matplotlib.lines.Line2D at 0x142a2894e20>]
[<matplotlib.lines.Line2D at 0x142a28b1160>]
%matplotlib inline
import matplotlib.pyplot as plt
#plt.plot(lin_search_timings, 'ro')
plt.plot(bin_search_timings, 'gs')
plt.plot(dict_timings, 'b^')
plt.show()
[<matplotlib.lines.Line2D at 0x142a2985940>]
[<matplotlib.lines.Line2D at 0x142a2985d30>]
class LookupDS: # write our own dictionary(map)
# support key value pairs
def __init__(self): # constructor LookupDS
self.data = [] #empty list, able to use python automatic list grow
# this list will contain elements that are 2 element lists, [key, value]
# 0 1
#O(n)
def __setitem__(self, key, value): #supports d[key]=value
# must support both a new key and a key that already exists
# search self.data[][0] iterate over self.data
# start at index 0, while key not found (and not end), keep looking
# if find, set the vlaue
# if not found (exit the loop) , add the key value pair
i=0
while i<len(self.data) and self.data[i][0] != key:
i+=1
if i==len(self.data):
self.data.append([key, value]) # let python handle list growth
else:
self.data[i][1]=value;
#O(n)
def __getitem__(self, key): #supports d[key] return the value
# must support key that already exists and raise exception if not there
#search self.data[][0] iterate over self.data
for i in range(len(self.data)):
if self.data[i][0] == key:
return self.data[i][1]
# if leave the loop without returning, it is a not found
raise KeyError()
def __contains__(self, key):
try:
x=self.__getitem__(key)
return True
except:
return False
l = LookupDS()
l['batman'] = 'bruce wayne'
l['superman'] = 'clark kent'
l['spiderman'] = 'peter parker'
l['spiderman']
l['batman']
l['pythonman']
'peter parker'
'bruce wayne'
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-7-56a97f4fc49f> in <module> 1 l['spiderman'] 2 l['batman'] ----> 3 l['pythonman'] <ipython-input-5-05be0b117919> in __getitem__(self, key) 29 return self.data[i][1] 30 # if leave the loop without returning, it is a not found ---> 31 raise KeyError() 32 33 def __contains__(self, key): KeyError:
'batman' in l
'pythonman' in l
l['batman']= 'michael keaton'
l['pythonman']= 'michael Lee'
l['batman']
l['pythonman']
True
False
'michael keaton'
'michael Lee'
l.data
[['batman', 'michael keaton'], ['superman', 'clark kent'], ['spiderman', 'peter parker'], ['pythonman', 'michael Lee']]
Hashes (a.k.a. hash codes or hash values) are simply numerical values computed for objects.
hash('hello') # hash works on immutable objects, and is repeatable
-1815644428330184742
hash('batman')
-6447852574822505115
hash('batmen')
-9097705683731746874
hash('batmen')
-9097705683731746874
[hash(s) for s in ['different', 'objects', 'have', 'very', 'different', 'hashes']]
[8406822390216371770, 1579821317360321670, 8970529510659063439, 8945551818786971727, 8406822390216371770, -2666044671640180942]
# part way to translate a string to a numeric index
[i%100 for i in range(10, 1000, 40)]
[10, 50, 90, 30, 70, 10, 50, 90, 30, 70, 10, 50, 90, 30, 70, 10, 50, 90, 30, 70, 10, 50, 90, 30, 70]
# assume 100 is size of my array, indexes are 0 to 99
# assume the has function is O(1), it does not matter how large of an object I hash
[hash(s)%100 for s in ['different', 'objects', 'have', 'very', 'different', 'hashes']]
[70, 70, 39, 27, 70, 58]
[hash(s)%1000 for s in ['different', 'objects', 'have', 'very', 'different', 'hashes']]
[770, 670, 439, 727, 770, 58]
class Hashtable:
def __init__(self, n_buckets=1000):
#utilize hash(key)%len(self.buckets) gets us an index
self.buckets = [None] * n_buckets # list of size n_buckets filled with Nones
# we will not let Python grow this list (array) automatically. WHY NOT?
# hash table cannot grow unless you re-hash everything with new % length
def __setitem__(self, key, val): # if hash is O(1) and no collision, set is O(1)
# generate the index hash(key)%len(self.buckets)
# put the val at index position in self.buckets
bucket_index = hash(key)%len(self.buckets)
self.buckets[bucket_index] = val # for now we are not storing the key
# ignore collisions
def __getitem__(self, key):
bucket_index = hash(key)%len(self.buckets)
if self.buckets[bucket_index] == None:
raise KeyError
else:
return self.buckets[bucket_index]
def __contains__(self, key):
try:
_ = self[key]
return True
except:
return False
ht = Hashtable(10)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'
ht['superman']
ht['pythonman']
'clark kent'
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-23-b557976902e7> in <module> 1 ht['superman'] ----> 2 ht['pythonman'] <ipython-input-21-190c2a2e3186> in __getitem__(self, key) 16 bucket_index = hash(key)%len(self.buckets) 17 if self.buckets[bucket_index] == None: ---> 18 raise KeyError 19 else: 20 return self.buckets[bucket_index] KeyError:
ht['superman']='matt'
ht['pythonman']='bob'
ht['superman']
ht['pythonman']
'matt'
'bob'
'superman' in ht
'mom' in ht
True
False
ht.buckets
['bob', 'matt', None, None, None, 'bruce wayne', None, None, None, 'peter parker']
ht1 = Hashtable(5)
ht1['batman'] = 'bruce wayne'
ht1['superman'] = 'clark kent'
ht1['spiderman'] = 'peter parker'
ht1['pythonman'] = 'michael lee' #collision with batman
ht1.buckets
['michael lee', 'clark kent', None, None, 'peter parker']
Problem statement: Given $N$ people at a party, how likely is it that at least two people will have the same birthday?
def birthday_p(n_people):
p_inv = 1
for n in range(365, 365-n_people, -1):
p_inv *= n / 365
return 1 - p_inv
birthday_p(2)
0.002739726027397249
1-364/365
0.002739726027397249
%matplotlib inline
import matplotlib.pyplot as plt
n_people = range(1, 80)
plt.plot(n_people, [birthday_p(n) for n in n_people])
plt.show()
[<matplotlib.lines.Line2D at 0x142a2abf130>]
Repeat the birthday problem, but with a given number of values and "buckets" that are allotted to hold them. How likely is it that two or more values will map to the same bucket?
def collision_p(n_values, n_buckets):
p_inv = 1
for n in range(n_buckets, n_buckets-n_values, -1):
p_inv *= n / n_buckets
return 1 - p_inv
collision_p(23, 365) # same as birthday problem, for 23 people
0.5072972343239857
collision_p(10, 100)
0.37184349044470544
collision_p(100, 1000)
0.9940410733677595
# keeping number of values fixed at 100, but vary number of buckets: visualize probability of collision
%matplotlib inline
import matplotlib.pyplot as plt
n_buckets = range(100, 100001, 1000)
plt.plot(n_buckets, [collision_p(100, nb) for nb in n_buckets])
plt.show()
[<matplotlib.lines.Line2D at 0x142a2b14820>]
def avg_num_collisions(n, b):
"""Returns the expected number of collisions for n values uniformly distributed
over a hashtable of b buckets. Based on (fairly) elementary probability theory.
(Pay attention in MATH 474!)"""
return n - b + b * (1 - 1/b)**n
avg_num_collisions(28, 365)
1.011442040700615
avg_num_collisions(1000, 1000)
367.6954247709637
avg_num_collisions(1000, 10000)
48.32893558556316
To deal with collisions in a hashtable, we simply create a "chain" of key/value pairs for each bucket where collisions occur. The chain needs to be a data structure that supports quick insertion — natural choice: the linked list!
class Hashtable:
class Node:
def __init__(self, key, val, next=None):
self.key = key
self.val = val
self.next = next
def __init__(self, n_buckets=1000):
self.buckets = [None] * n_buckets
def __setitem__(self, key, val):
bucket_idx = hash(key) % len(self.buckets)
# look at the linked list at that bucket
# if find key, replace the val
#if not find the key, prepend a new node at front of linked list at the bucket
# buckets[bucket_idx] this is either None or the head of a linked list
n = self.buckets[bucket_idx] # pointer to walk the linked list
while n and n.key!=key:
n=n.next
if not n: # we did not find the key in the linkced list , then prepend
self.buckets[bucket_idx]=Hashtable.Node(key, val, self.buckets[bucket_idx] )
else: # if found key, replace the value
n.val=val
def __getitem__(self, key):
bucket_idx = hash(key) % len(self.buckets)
n = self.buckets[bucket_idx]
while n and n.key!=key: # if find key, return the value
# if not find key, raise key error
n=n.next
if not n:
raise KeyError
else:
return n.val
def __contains__(self, key):
try:
_ = self[key]
return True
except:
return False
def __iter__(self):
for i in range(len(self.buckets)):
n = self.buckets[i]
while n: # walk linked list at the bucket
yield (i, n.key, n.val)
n=n.next
ht = Hashtable(1)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'
ht['batman']
ht['superman']
ht['spiderman']
'bruce wayne'
'clark kent'
'peter parker'
for x in ht:
print(x)
(0, 'spiderman', 'peter parker') (0, 'superman', 'clark kent') (0, 'batman', 'bruce wayne')
def prep_ht(size):
ht = Hashtable(size*10)
for x in range(size):
ht[x] = x
return ht
def time_ht(size):
return timeit.timeit('ht[random.randrange({})]'.format(size),
'import random ; from __main__ import prep_ht ;'
'ht = prep_ht({})'.format(size),
number=100)
ht_timings = [time_ht(n)
for n in range(10, 10000, 100)]
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(bin_search_timings, 'ro')
plt.plot(ht_timings, 'gs')
plt.plot(dict_timings, 'b^')
plt.show()
[<matplotlib.lines.Line2D at 0x20669de90a0>]
[<matplotlib.lines.Line2D at 0x20669de94f0>]
[<matplotlib.lines.Line2D at 0x20669de9880>]
class Hashtable(Hashtable):
def __iter__(self):
for i in range(len(self.buckets)):
n = self.buckets[i]
while n: # walk linked list at the bucket
yield n.key
n=n.next
ht = Hashtable(1)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'
for k in ht:
print(k)
spiderman superman batman
ht = Hashtable()
d = {}
for x in 'apple banana cat dog elephant'.split():
d[x[0]] = x
ht[x[0]] = x
for k in d:
print(k, '=>', d[k]) # ordered by insert order (python dict)
a => apple b => banana c => cat d => dog e => elephant
for k in ht:
print(k, '=>', ht[k]) #ordered in the order we entered items (my hashtbable)
d => dog b => banana a => apple e => elephant c => cat
It doesn't often make sense to start with a large number of buckets, unless we know in advance that the number of keys is going to be vast — also, the user of the hashtable would typically prefer to not be bothered with implementation details (i.e., bucket count) when using the data structure.
Instead: start with a relatively small number of buckets, and if the ratio of keys to the number of buckets (known as the load factor) is above some desired threshold — which we can determine using collision probabilities — we can dynamically increase the number of buckets. This requires, however, that we rehash all keys and potentially move them into new buckets (since the hash(key) % num_buckets
mapping will likely be different with more buckets).
__setitem__
(to update value for existing key)__delitem__
keys
& values
(return iterators for keys and values)setdefault
For a hashtable with $N$ key/value entries:
Remember: a given object must always hash to the same value. This is required so that we can always map the object to the same hash bucket.
Hashcodes for collections of objects are usually computed from the hashcodes of its contents, e.g., the hash of a tuple is a function of the hashes of the objects in said tuple:
hash(('two', 'strings'))
This is useful. It allows us to use a tuple, for instance, as a key for a hashtable.
However, if the collection of objects is mutable — i.e., we can alter its contents — this means that we can potentially change its hashcode.`
If we were to use such a collection as a key in a hashtable, and alter the collection after it's been assigned to a particular bucket, this leads to a serious problem: the collection may now be in the wrong bucket (as it was assigned to a bucket based on its original hashcode)!
For this reason, only immutable types are, by default, hashable in Python. So while we can use integers, strings, and tuples as keys in dictionaries, lists (which are mutable) cannot be used. Indeed, Python marks built-in mutable types as "unhashable", e.g.,
hash([1, 2, 3])
That said, Python does support hashing on instances of custom classes (which are mutable). This is because the default hash function implementation does not rely on the contents of instances of custom classes. E.g.,
class Student:
def __init__(self, fname, lname):
self.fname = fname
self.lname = lname
s = Student('John', 'Doe')
hash(s)
s.fname = 'Jane'
hash(s) # same as before mutation
We can change the default behavior by providing our own hash function in __hash__
, e.g.,
class Student:
def __init__(self, fname, lname):
self.fname = fname
self.lname = lname
def __hash__(self):
return hash(self.fname) + hash(self.lname)
s = Student('John', 'Doe')
hash(s)
-1504933712619484690
s.fname = 'Jane'
hash(s)
1246817324432521191
hash([1,2])
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-28-9ce67481a686> in <module> ----> 1 hash([1,2]) TypeError: unhashable type: 'list'
But be careful: instances of this class are no longer suitable for use as keys in hashtables (or dictionaries), if you intend to mutate them after using them as keys!