# by default, only the result of the last expression in a cell is displayed after evaluation.
# the following forces display of *all* self-standing expressions in a cell.

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"


import timeit

def lin_search(lst, x):
    for i in range(len(lst)):
        if lst[i] == x:
            return i
    raise ValueError(x)
    
def bin_search(lst, x):
    # assume that lst is sorted!!!
    low = 0
    hi  = len(lst)
    mid = (low + hi) // 2
    while lst[mid] != x and low <= hi:
        if lst[mid] < x:
            low = mid + 1
        else:
            hi  = mid - 1
        mid = (low + hi) // 2
    if lst[mid] == x:
        return mid
    else:
        raise ValueError(x)

def time_lin_search(size):
    return timeit.timeit('lin_search(lst, random.randrange({}))'.format(size), # interpolate size into randrange
                         'import random ; from __main__ import lin_search ;'
                         'lst = [x for x in range({})]'.format(size), # interpolate size into list range
                         number=100)

def time_bin_search(size):
    return timeit.timeit('bin_search(lst, random.randrange({}))'.format(size), # interpolate size into randrange
                         'import random ; from __main__ import bin_search ;'
                         'lst = [x for x in range({})]'.format(size), # interpolate size into list range
                         number=100)

def time_dict(size):
    return timeit.timeit('dct[random.randrange({})]'.format(size), 
                         'import random ; '
                         'dct = {{x: x for x in range({})}}'.format(size),
                         number=100)

lin_search_timings = [time_lin_search(n)
                      for n in range(10, 10000, 100)]

bin_search_timings = [time_bin_search(n)
                      for n in range(10, 10000, 100)]

dict_timings = [time_dict(n)
                for n in range(10, 10000, 100)]


%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(lin_search_timings, 'ro')
plt.plot(bin_search_timings, 'gs')
plt.plot(dict_timings, 'b^')
plt.show()

[<matplotlib.lines.Line2D at 0x14ab8bf2970>]

[<matplotlib.lines.Line2D at 0x14ab8bf2e20>]

[<matplotlib.lines.Line2D at 0x14ab8c0e160>]


%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(bin_search_timings, 'gs')
plt.plot(dict_timings, 'b^')
plt.show()

[<matplotlib.lines.Line2D at 0x14ab8cee040>]

[<matplotlib.lines.Line2D at 0x14ab8cee250>]


class LookupDS:    # NOT O(1) get and set
    def __init__(self):   # key value pairs are stored as 2 element list
        self.data = []
        self.count=0
        #   [ key, value]
        #      0     1
        # self.data[1]   accesses the 2nd element in the list
        # self.data[1][0]    accesses the  KEY of 2nd element in the list    
        # self.data[1][1] accesses the VALUE of 2nd element in the list
    
    def __setitem__(self, key, value):
        # set the dictionary[key] equal to the value   dct['superman']='clark kent'
        # if the key exists, replace the value   WALK THE self.data
        # if the key does not exist, add the [key,value] to the dictionary
        for i in range(self.count):
            if self.data[i][0] == key:
                self.data[i][1]= value
                break
        else:  # only excutes if for loop finishes normally
 #           self.data[self.count][0]=key
 #           self.data[self.count][1]=value
            self.data.append([key,value])
            self.count+=1
    
    def __getitem__(self, key):
# if the key exists, return the value   WALK THE self.data
        # if the key does not exist, raise KeyError
        for i in range(self.count):
            if self.data[i][0] == key:
                return self.data[i][1]
        else:  # only excutes if for loop finishes normally
            raise KeyError()
            
    def __contains__(self, key): #   if x in dict   boolean true/false
        try:
            x=self[key]    # self is a  LookupDS, call ing __getItem__
            #  x=self.__getItem__(key)
            return True
        except:
            return False


l = LookupDS()
l['batman'] = 'bruce wayne'
l['superman'] = 'clark kent'
l['spiderman'] = 'peter parker'


'batman' in l
'bateman' in l
l['pythonguy']='michael lee'
l['spiderman'] = '?????'

True

False


l['pythonguy']
l['spiderman']
l.data

'michael lee'

'?????'

[['batman', 'bruce wayne'],
 ['superman', 'clark kent'],
 ['spiderman', '?????'],
 ['pythonguy', 'michael lee']]


# to achieve O(1) lookup in an array we need an integer index
# l['pythonguy'] is not an  integer index
# convert the characters using ascii values, add them up, we have an int!
# what abotu possible collisions
# what is the range of possible sums of ascii values for strings?   infinite
# an arrays are finite!
hash('hello')    # hash is more complex calculation than just adding the ASCII values

-4357571102096799384


hash('hello')

-4357571102096799384


hash('batman')

8017273964607846977


hash('batmen')

8875827903189815927


hash((1,2,3))

529344067295497451


hash(37)

37


hash(37.5)

1152921504606847013


hash ([1,2,3])   # you cannot hash mutable data

---------------------------------------------------------------------------
TypeError                                 Traceback (most recent call last)
<ipython-input-22-fc514a49c0fa> in <module>
----> 1 hash ([1,2,3])

TypeError: unhashable type: 'list'


[hash(s) for s in ['different', 'objects', 'have', 'very', 'different', 'hashes']]

[-8140701765690394690,
 -7524609570572543266,
 -5506113723973798561,
 9206191555290154536,
 -8140701765690394690,
 -3723345771504888340]


[i%100 for i in range(10, 1000, 40)]   # between 0 and 99

[10,
 50,
 90,
 30,
 70,
 10,
 50,
 90,
 30,
 70,
 10,
 50,
 90,
 30,
 70,
 10,
 50,
 90,
 30,
 70,
 10,
 50,
 90,
 30,
 70]


[hash(s)%100 for s in ['different', 'objects', 'have', 'very', 'different', 'hashes']]

[10, 34, 39, 36, 10, 60]


#hash the key object
#then mod with the size of the array
# we get a integer index in the array
# problems:  two objects might hash to the same index   COLLISION
# even if different hashes, after we mod they might hit the same index  COLLISON


    class Hashtable:   # first attempt
    def __init__(self, n_buckets=1000):
        self.buckets = [None] * n_buckets    # pre-allocating a list of Nones
        # to the size of the hash table we think we need
        # we will NOT be growing  self.buckets  FIXED SIZE
        # n_buckets  will be used for the mod  0 to len(self.buckets)-1
        #  ??? % len(self.buckets)  yields 0 to len(self.buckets)-1
        
    def __setitem__(self, key, val):  #O(1)   ASSUMING HASH IS O(1)
     #  get the hash of the key
    #  idx= mod it with len(self.buckets)   
    #    self.bucket[idx] = val
    # note we are NOT storing the actual key  
    # idx is the hash of the key with mod applied
        idx = hash(key)%len(self.buckets) 
        self.buckets[idx]=val    # possible COLLISION ISSUES handled later 
    
    def __getitem__(self, key):   #O(1)
        idx = hash(key)%len(self.buckets) 
        if self.buckets[idx]:   # not a NONE
            return self.buckets[idx]
        else:
            raise KeyError()
    
    def __contains__(self, key):
        try:
            _ = self[key]   # calls get
            return True   
        except:
            return False


ht = Hashtable(10)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'


ht['batman']
ht['superman'] ="???"
ht['suparman']

'bruce wayne'

---------------------------------------------------------------------------
KeyError                                  Traceback (most recent call last)
<ipython-input-31-51b7ebbf25a2> in <module>
      1 ht['batman']
      2 ht['superman'] ="???"
----> 3 ht['suparman']

<ipython-input-29-87bd7878bbec> in __getitem__(self, key)
     21             return self.buckets[idx]
     22         else:
---> 23             raise KeyError()
     24 
     25     def __contains__(self, key):

KeyError:


ht.buckets

['???',
 None,
 None,
 None,
 'peter parker',
 None,
 None,
 'bruce wayne',
 None,
 None]


def birthday_p(n_people):
    p_inv = 1
    for n in range(365, 365-n_people, -1):
        p_inv *= n / 365
    return 1 - p_inv


birthday_p(2)

0.002739726027397249


1-364/365

0.002739726027397249


%matplotlib inline
import matplotlib.pyplot as plt

n_people = range(1, 80)
plt.plot(n_people, [birthday_p(n) for n in n_people])
plt.show()

[<matplotlib.lines.Line2D at 0x14ab8e18970>]


def collision_p(n_values, n_buckets):
    p_inv = 1
    for n in range(n_buckets, n_buckets-n_values, -1):
        p_inv *= n / n_buckets
    return 1 - p_inv


collision_p(23, 365) # same as birthday problem, for 23 people

0.5072972343239857


collision_p(10, 100)

0.37184349044470544


collision_p(100, 1000)

0.9940410733677595


collision_p(100, 10000)

0.3914340350427218


collision_p(100, 100000)

0.04831047413228129


# keeping number of values fixed at 100, but vary number of buckets: visualize probability of collision
%matplotlib inline
import matplotlib.pyplot as plt

n_buckets = range(100, 100001, 1000)
plt.plot(n_buckets, [collision_p(100, nb) for nb in n_buckets])
plt.show()

[<matplotlib.lines.Line2D at 0x14ab8e772e0>]


def avg_num_collisions(n, b):
    """Returns the expected number of collisions for n values uniformly distributed
    over a hashtable of b buckets. Based on (fairly) elementary probability theory.
    (Pay attention in MATH 474!)"""
    return n - b + b * (1 - 1/b)**n


avg_num_collisions(28, 365)

1.011442040700615


avg_num_collisions(1000, 1000)

367.6954247709637


avg_num_collisions(1000, 10000)

48.32893558556316


class Hashtable:
    class Node:
        def __init__(self, key, val, next=None):
            self.key = key
            self.val = val
            self.next = next
            
    def __init__(self, n_buckets=1000):
        self.buckets = [None] * n_buckets
        
    def __setitem__(self, key, val):     #  ht = HashTable()
                                        #  ht['superman']='clark kent'
        bucket_idx = hash(key) % len(self.buckets)
        # how do we insert (or update) a key value pair in the hashtable
        # check if self.buckets[idx]  is None, nothing is hashed to that position
        #  this key must be new, insert the node at self.buckets[idx] 
        # check if self.buckets[idx]  is NOT None,  Walk the linked lsit
        # at that  self.buckets[idx]   looking for the key
        #      if find key, replace the value in the node
        #     if not find key,make new node and put at end of the list
        # self.buckets[idx] contains a pointer to the collision linked list at [idx]
        n = self.buckets[bucket_idx]   # n is pointer to head of linked list(if n is not None, n is a Node)
        n_prior=None
        while n:
            if n.key == key:
                n.val = val
                return
            else:
                n_prior=n
                n=n.next
        
        # INSERT AT END OF LINKED LIST
        # when I exit this loop, I reached the end of the linked list  n=None
        # n_prior is pointer to the last node in the linked list
        if n_prior is not None:
            n_prior.next = Hashtable.Node(key, val, None)
        else:
            self.buckets[bucket_idx] = Hashtable.Node(key, val, None)
            
        # OR always insert at begining of linked list
        #self.buckets[bucket_idx] = HashTable.Node(key, val, self.buckets[bucket_idx])
    
    def __getitem__(self, key):
        bucket_idx = hash(key) % len(self.buckets)
        n = self.buckets[bucket_idx]   # n is pointer to head of linked list(if n is not None, n is a Node)
        while n:   # walk the collision linked list looking for the key
            if n.key == key:
                return n.val
            n=n.next
        else:
            raise KeyError()
    
    
    
    def __contains__(self, key):
        try:
            _ = self[key]    # this is calling __getitem__
            return True
        except:
            return False


ht = Hashtable(1)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'

ht['batman'] 
ht['superman']
ht['spiderman']

ht['batman']  = 'me'
ht['batman']

'bruce wayne'

'clark kent'

'peter parker'

'me'


ht.buckets[0].key
ht.buckets[0].val

ht.buckets[0].next.key
ht.buckets[0].next.val

ht.buckets[0].next.next.key
ht.buckets[0].next.next.val

'batman'

'me'

'superman'

'clark kent'

'spiderman'

'peter parker'


def prep_ht(size):
    ht = Hashtable(size*10)
    for x in range(size):
        ht[x] = x
    return ht

def time_ht(size):
    return timeit.timeit('ht[random.randrange({})]'.format(size), 
                         'import random ; from __main__ import prep_ht ;'
                         'ht = prep_ht({})'.format(size),
                         number=100)

ht_timings = [time_ht(n)
                for n in range(10, 10000, 100)]


%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(bin_search_timings, 'ro')
plt.plot(ht_timings, 'gs')
plt.plot(dict_timings, 'b^')
plt.show()

[<matplotlib.lines.Line2D at 0x1a282792f70>]

[<matplotlib.lines.Line2D at 0x1a2829f2400>]

[<matplotlib.lines.Line2D at 0x1a2829f2790>]


class Hashtable(Hashtable):
    def __iter__(self):    # yields keys in the hashtable
        for bucket in self.buckets:
            n = bucket
            while n is not None:
                yield n.key
                n=n.next


ht = Hashtable(1)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'


for k in ht:
    print(k)

batman
superman
spiderman


ht = Hashtable(10)
ht['batman'] = 'bruce wayne'
ht['superman'] = 'clark kent'
ht['spiderman'] = 'peter parker'


for k in ht:   # demonstrates the randomness of hash inserts
    print(k)

spiderman
batman
superman


ht = Hashtable()
d = {}
for x in 'apple banana cat dog elephant'.split():
    d[x[0]] = x
    ht[x[0]] = x


for k in d:   # python dictionary does support iterating in the order of inserts
    print(k, '=>', d[k])

a => apple
b => banana
c => cat
d => dog
e => elephant


for k in ht:    # my hash table does not support iteration in order of insert
    print(k, '=>', ht[k])

a => apple
b => banana
d => dog
c => cat
e => elephant


hash(('two', 'strings'))


hash([1, 2, 3])


class Student:
    def __init__(self, fname, lname):
        self.fname = fname
        self.lname = lname


s = Student('John', 'Doe')
hash(s)


s.fname = 'Jane'
hash(s) # same as before mutation


class Student:
    def __init__(self, fname, lname):
        self.fname = fname
        self.lname = lname
        
    def __hash__(self):
        return hash(self.fname) + hash(self.lname)


s = Student('John', 'Doe')
hash(s)


s.fname = 'Jane'
hash(s)

Hashtables¶

Agenda¶

Discussion: pros/cons of array-backed and linked structures¶

Python's other built-in DS: the `dict`¶

Direct lookups via Hashing¶

Hashtables¶

On Collisions¶

The "Birthday Problem"¶

General collision statistics¶

Dealing with Collisions¶

Loose ends¶

Iteration¶

Key ordering¶

"Load factor" and Rehashing¶

Other APIs¶

Runtime analysis & Discussion¶

Vocabulary list¶

Addendum: On Hashability¶

Hashtables¶

Agenda¶

Discussion: pros/cons of array-backed and linked structures¶

Python's other built-in DS: the dict¶

Direct lookups via Hashing¶

Hashtables¶

On Collisions¶

The "Birthday Problem"¶

General collision statistics¶

Dealing with Collisions¶

Loose ends¶

Iteration¶

Key ordering¶

"Load factor" and Rehashing¶

Other APIs¶

Runtime analysis & Discussion¶

Vocabulary list¶

Addendum: On Hashability¶

Python's other built-in DS: the `dict`¶