# by default, only the result of the last expression in a cell is displayed after evaluation.
# the following forces display of *all* self-standing expressions in a cell.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"
Between the array-backed and linked list we have:
dict¶import timeit
def lin_search(lst, x):
for i in range(len(lst)):
if lst[i] == x:
return i
raise ValueError(x)
def bin_search(lst, x):
# assume that lst is sorted!!!
low = 0
hi = len(lst)
mid = (low + hi) // 2
while lst[mid] != x and low <= hi:
if lst[mid] < x:
low = mid + 1
else:
hi = mid - 1
mid = (low + hi) // 2
if lst[mid] == x:
return mid
else:
raise ValueError(x)
def time_lin_search(size):
return timeit.timeit('lin_search(lst, random.randrange({}))'.format(size), # interpolate size into randrange
'import random ; from __main__ import lin_search ;'
'lst = [x for x in range({})]'.format(size), # interpolate size into list range
number=100)
def time_bin_search(size):
return timeit.timeit('bin_search(lst, random.randrange({}))'.format(size), # interpolate size into randrange
'import random ; from __main__ import bin_search ;'
'lst = [x for x in range({})]'.format(size), # interpolate size into list range
number=100)
def time_dict(size):
return timeit.timeit('dct[random.randrange({})]'.format(size),
'import random ; '
'dct = {{x: x for x in range({})}}'.format(size),
number=100)
lin_search_timings = [time_lin_search(n)
for n in range(10, 10000, 100)]
bin_search_timings = [time_bin_search(n)
for n in range(10, 10000, 100)]
dict_timings = [time_dict(n)
for n in range(10, 10000, 100)]
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(lin_search_timings, 'ro')
plt.plot(bin_search_timings, 'gs')
plt.plot(dict_timings, 'b^')
plt.show()
[<matplotlib.lines.Line2D at 0x14ab8bf2970>]
[<matplotlib.lines.Line2D at 0x14ab8bf2e20>]
[<matplotlib.lines.Line2D at 0x14ab8c0e160>]
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(bin_search_timings, 'gs')
plt.plot(dict_timings, 'b^')
plt.show()
[<matplotlib.lines.Line2D at 0x14ab8cee040>]
[<matplotlib.lines.Line2D at 0x14ab8cee250>]
class LookupDS: # NOT O(1) get and set
def __init__(self): # key value pairs are stored as 2 element list
self.data = []
self.count=0
# [ key, value]
# 0 1
# self.data[1] accesses the 2nd element in the list
# self.data[1][0] accesses the KEY of 2nd element in the list
# self.data[1][1] accesses the VALUE of 2nd element in the list
def __setitem__(self, key, value):
# set the dictionary[key] equal to the value dct['superman']='clark kent'
# if the key exists, replace the value WALK THE self.data
# if the key does not exist, add the [key,value] to the dictionary
for i in range(self.count):
if self.data[i][0] == key:
self.data[i][1]= value
break
else: # only excutes if for loop finishes normally
# self.data[self.count][0]=key
# self.data[self.count][1]=value
self.data.append([key,value])
self.count+=1
def __getitem__(self, key):
# if the key exists, return the value WALK THE self.data
# if the key does not exist, raise KeyError
for i in range(self.count):
if self.data[i][0] == key:
return self.data[i][1]
else: # only excutes if for loop finishes normally
raise KeyError()
def __contains__(self, key): # if x in dict boolean true/false
try:
x=self[key] # self is a LookupDS, call ing __getItem__
# x=self.__getItem__(key)
return True
except:
return False
l = LookupDS()
l['batman'] = 'bruce wayne'
l['superman'] = 'clark kent'
l['spiderman'] = 'peter parker'
'batman' in l
'bateman' in l
l['pythonguy']='michael lee'
l['spiderman'] = '?????'
True
False
l['pythonguy']
l['spiderman']
l.data
'michael lee'
'?????'
[['batman', 'bruce wayne'], ['superman', 'clark kent'], ['spiderman', '?????'], ['pythonguy', 'michael lee']]
Hashes (a.k.a. hash codes or hash values) are simply numerical values computed for objects.
# to achieve O(1) lookup in an array we need an integer index
# l['pythonguy'] is not an integer index
# convert the characters using ascii values, add them up, we have an int!
# what abotu possible collisions
# what is the range of possible sums of ascii values for strings? infinite
# an arrays are finite!
hash('hello') # hash is more complex calculation than just adding the ASCII values
-4357571102096799384
hash('hello')
-4357571102096799384
hash('batman')
8017273964607846977
hash('batmen')
8875827903189815927
hash((1,2,3))
529344067295497451
hash(37)
37
hash(37.5)
1152921504606847013
hash ([1,2,3]) # you cannot hash mutable data
--------------------------------------------------------------------------- TypeError Traceback (most recent call last) <ipython-input-22-fc514a49c0fa> in <module> ----> 1 hash ([1,2,3]) TypeError: unhashable type: 'list'
[hash(s) for s in ['different', 'objects', 'have', 'very', 'different', 'hashes']]
[-8140701765690394690, -7524609570572543266, -5506113723973798561, 9206191555290154536, -8140701765690394690, -3723345771504888340]
[i%100 for i in range(10, 1000, 40)] # between 0 and 99
[10, 50, 90, 30, 70, 10, 50, 90, 30, 70, 10, 50, 90, 30, 70, 10, 50, 90, 30, 70, 10, 50, 90, 30, 70]
[hash(s)%100 for s in ['different', 'objects', 'have', 'very', 'different', 'hashes']]
[10, 34, 39, 36, 10, 60]
#hash the key object
#then mod with the size of the array
# we get a integer index in the array
# problems: two objects might hash to the same index COLLISION
# even if different hashes, after we mod they might hit the same index COLLISON
class Hashtable: # first attempt
def __init__(self, n_buckets=1000):
self.buckets = [None] * n_buckets # pre-allocating a list of Nones
# to the size of the hash table we think we need
# we will NOT be growing self.buckets FIXED SIZE
# n_buckets will be used for the mod 0 to len(self.buckets)-1
# ??? % len(self.buckets) yields 0 to len(self.buckets)-1
def __setitem__(self, key, val): #O(1) ASSUMING HASH IS O(1)
# get the hash of the key
# idx= mod it with len(self.buckets)
# self.bucket[idx] = val
# note we are NOT storing the actual key
# idx is the hash of the key with mod applied
idx = hash(key)%len(self.buckets)
self.buckets[idx]=val # possible COLLISION ISSUES handled later
def __getitem__(self, key): #O(1)
idx = hash(key)%len(self.buckets)
if self.buckets[idx]: # not a NONE
return self.buckets[idx]
else:
raise KeyError()
def __contains__(self, key):
try:
_ = self[key] # calls get
return True
except:
return False
ht = Hashtable(10)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'
ht['batman']
ht['superman'] ="???"
ht['suparman']
'bruce wayne'
--------------------------------------------------------------------------- KeyError Traceback (most recent call last) <ipython-input-31-51b7ebbf25a2> in <module> 1 ht['batman'] 2 ht['superman'] ="???" ----> 3 ht['suparman'] <ipython-input-29-87bd7878bbec> in __getitem__(self, key) 21 return self.buckets[idx] 22 else: ---> 23 raise KeyError() 24 25 def __contains__(self, key): KeyError:
ht.buckets
['???', None, None, None, 'peter parker', None, None, 'bruce wayne', None, None]
Problem statement: Given $N$ people at a party, how likely is it that at least two people will have the same birthday?
def birthday_p(n_people):
p_inv = 1
for n in range(365, 365-n_people, -1):
p_inv *= n / 365
return 1 - p_inv
birthday_p(2)
0.002739726027397249
1-364/365
0.002739726027397249
%matplotlib inline
import matplotlib.pyplot as plt
n_people = range(1, 80)
plt.plot(n_people, [birthday_p(n) for n in n_people])
plt.show()
[<matplotlib.lines.Line2D at 0x14ab8e18970>]
Repeat the birthday problem, but with a given number of values and "buckets" that are allotted to hold them. How likely is it that two or more values will map to the same bucket?
def collision_p(n_values, n_buckets):
p_inv = 1
for n in range(n_buckets, n_buckets-n_values, -1):
p_inv *= n / n_buckets
return 1 - p_inv
collision_p(23, 365) # same as birthday problem, for 23 people
0.5072972343239857
collision_p(10, 100)
0.37184349044470544
collision_p(100, 1000)
0.9940410733677595
collision_p(100, 10000)
0.3914340350427218
collision_p(100, 100000)
0.04831047413228129
# keeping number of values fixed at 100, but vary number of buckets: visualize probability of collision
%matplotlib inline
import matplotlib.pyplot as plt
n_buckets = range(100, 100001, 1000)
plt.plot(n_buckets, [collision_p(100, nb) for nb in n_buckets])
plt.show()
[<matplotlib.lines.Line2D at 0x14ab8e772e0>]
def avg_num_collisions(n, b):
"""Returns the expected number of collisions for n values uniformly distributed
over a hashtable of b buckets. Based on (fairly) elementary probability theory.
(Pay attention in MATH 474!)"""
return n - b + b * (1 - 1/b)**n
avg_num_collisions(28, 365)
1.011442040700615
avg_num_collisions(1000, 1000)
367.6954247709637
avg_num_collisions(1000, 10000)
48.32893558556316
To deal with collisions in a hashtable, we simply create a "chain" of key/value pairs for each bucket where collisions occur. The chain needs to be a data structure that supports quick insertion — natural choice: the linked list!
class Hashtable:
class Node:
def __init__(self, key, val, next=None):
self.key = key
self.val = val
self.next = next
def __init__(self, n_buckets=1000):
self.buckets = [None] * n_buckets
def __setitem__(self, key, val): # ht = HashTable()
# ht['superman']='clark kent'
bucket_idx = hash(key) % len(self.buckets)
# how do we insert (or update) a key value pair in the hashtable
# check if self.buckets[idx] is None, nothing is hashed to that position
# this key must be new, insert the node at self.buckets[idx]
# check if self.buckets[idx] is NOT None, Walk the linked lsit
# at that self.buckets[idx] looking for the key
# if find key, replace the value in the node
# if not find key,make new node and put at end of the list
# self.buckets[idx] contains a pointer to the collision linked list at [idx]
n = self.buckets[bucket_idx] # n is pointer to head of linked list(if n is not None, n is a Node)
n_prior=None
while n:
if n.key == key:
n.val = val
return
else:
n_prior=n
n=n.next
# INSERT AT END OF LINKED LIST
# when I exit this loop, I reached the end of the linked list n=None
# n_prior is pointer to the last node in the linked list
if n_prior is not None:
n_prior.next = Hashtable.Node(key, val, None)
else:
self.buckets[bucket_idx] = Hashtable.Node(key, val, None)
# OR always insert at begining of linked list
#self.buckets[bucket_idx] = HashTable.Node(key, val, self.buckets[bucket_idx])
def __getitem__(self, key):
bucket_idx = hash(key) % len(self.buckets)
n = self.buckets[bucket_idx] # n is pointer to head of linked list(if n is not None, n is a Node)
while n: # walk the collision linked list looking for the key
if n.key == key:
return n.val
n=n.next
else:
raise KeyError()
def __contains__(self, key):
try:
_ = self[key] # this is calling __getitem__
return True
except:
return False
ht = Hashtable(1)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'
ht['batman']
ht['superman']
ht['spiderman']
ht['batman'] = 'me'
ht['batman']
'bruce wayne'
'clark kent'
'peter parker'
'me'
ht.buckets[0].key
ht.buckets[0].val
ht.buckets[0].next.key
ht.buckets[0].next.val
ht.buckets[0].next.next.key
ht.buckets[0].next.next.val
'batman'
'me'
'superman'
'clark kent'
'spiderman'
'peter parker'
def prep_ht(size):
ht = Hashtable(size*10)
for x in range(size):
ht[x] = x
return ht
def time_ht(size):
return timeit.timeit('ht[random.randrange({})]'.format(size),
'import random ; from __main__ import prep_ht ;'
'ht = prep_ht({})'.format(size),
number=100)
ht_timings = [time_ht(n)
for n in range(10, 10000, 100)]
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(bin_search_timings, 'ro')
plt.plot(ht_timings, 'gs')
plt.plot(dict_timings, 'b^')
plt.show()
[<matplotlib.lines.Line2D at 0x1a282792f70>]
[<matplotlib.lines.Line2D at 0x1a2829f2400>]
[<matplotlib.lines.Line2D at 0x1a2829f2790>]
class Hashtable(Hashtable):
def __iter__(self): # yields keys in the hashtable
for bucket in self.buckets:
n = bucket
while n is not None:
yield n.key
n=n.next
ht = Hashtable(1)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'
for k in ht:
print(k)
batman superman spiderman
ht = Hashtable(10)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'
for k in ht: # demonstrates the randomness of hash inserts
print(k)
spiderman batman superman
ht = Hashtable()
d = {}
for x in 'apple banana cat dog elephant'.split():
d[x[0]] = x
ht[x[0]] = x
for k in d: # python dictionary does support iterating in the order of inserts
print(k, '=>', d[k])
a => apple b => banana c => cat d => dog e => elephant
for k in ht: # my hash table does not support iteration in order of insert
print(k, '=>', ht[k])
a => apple b => banana d => dog c => cat e => elephant
It doesn't often make sense to start with a large number of buckets, unless we know in advance that the number of keys is going to be vast — also, the user of the hashtable would typically prefer to not be bothered with implementation details (i.e., bucket count) when using the data structure.
Instead: start with a relatively small number of buckets, and if the ratio of keys to the number of buckets (known as the load factor) is above some desired threshold — which we can determine using collision probabilities — we can dynamically increase the number of buckets. This requires, however, that we rehash all keys and potentially move them into new buckets (since the hash(key) % num_buckets mapping will likely be different with more buckets).
__setitem__ (to update value for existing key)__delitem__keys & values (return iterators for keys and values)setdefaultFor a hashtable with $N$ key/value entries:
Remember: a given object must always hash to the same value. This is required so that we can always map the object to the same hash bucket.
Hashcodes for collections of objects are usually computed from the hashcodes of its contents, e.g., the hash of a tuple is a function of the hashes of the objects in said tuple:
hash(('two', 'strings'))
This is useful. It allows us to use a tuple, for instance, as a key for a hashtable.
However, if the collection of objects is mutable — i.e., we can alter its contents — this means that we can potentially change its hashcode.`
If we were to use such a collection as a key in a hashtable, and alter the collection after it's been assigned to a particular bucket, this leads to a serious problem: the collection may now be in the wrong bucket (as it was assigned to a bucket based on its original hashcode)!
For this reason, only immutable types are, by default, hashable in Python. So while we can use integers, strings, and tuples as keys in dictionaries, lists (which are mutable) cannot be used. Indeed, Python marks built-in mutable types as "unhashable", e.g.,
hash([1, 2, 3])
That said, Python does support hashing on instances of custom classes (which are mutable). This is because the default hash function implementation does not rely on the contents of instances of custom classes. E.g.,
class Student:
def __init__(self, fname, lname):
self.fname = fname
self.lname = lname
s = Student('John', 'Doe')
hash(s)
s.fname = 'Jane'
hash(s) # same as before mutation
We can change the default behavior by providing our own hash function in __hash__, e.g.,
class Student:
def __init__(self, fname, lname):
self.fname = fname
self.lname = lname
def __hash__(self):
return hash(self.fname) + hash(self.lname)
s = Student('John', 'Doe')
hash(s)
s.fname = 'Jane'
hash(s)
But be careful: instances of this class are no longer suitable for use as keys in hashtables (or dictionaries), if you intend to mutate them after using them as keys!