Searching, Sorting, and Timing

Agenda

  1. Timing
  2. Prelude: Timing list indexing
  3. Linear search
  4. Binary search
  5. Insertion sort

1. Timing

In [1]:
import time
time.time()
Out[1]:
1473271526.532128
In [2]:
from time import time
time()
Out[2]:
1473271526.54549
In [3]:
from time import time, sleep
start = time()
sleep(1)
end = time()
print(end - start)
1.005336046218872
In [4]:
start = time()
sum(range(10**6))
end = time()
print(end - start)
0.029382944107055664
In [5]:
def timeit(f):
    start = time()
    f()
    end = time()
    return end - start
In [6]:
timeit(lambda: sum(range(10**6)))
Out[6]:
0.027124881744384766
In [7]:
def timeit(f):
    duration = 0
    for _ in range(10):
        start = time()
        f()
        end = time()
        duration += end - start
    return duration / 10
In [8]:
timeit(lambda: sum(range(10**6)))
Out[8]:
0.027736496925354005
In [9]:
import timeit
timeit.timeit('sum(range(10**6))', number=100)
Out[9]:
2.737777608000215
In [10]:
timeit.timeit(stmt='sorted(lst)', 
              setup='import random ; lst = [random.random() for _ in range(100)]',
              number=100)
Out[10]:
0.001243830000021262
In [11]:
import random
def to_time(lst):
    # do something interesting with the list
    pass

timeit.timeit(stmt='to_time(lst)',
              setup='lst = [random.random() for _ in range(100)]',
              globals=globals(), # allows timeit to access functions we've defined in this notebook
              number=1000)
Out[11]:
9.794600009627175e-05

2. Prelude: Timing list indexing

In [12]:
timeit.timeit(stmt='lst[0]',
              setup='import random; lst=[0] * 10**6')
Out[12]:
0.03286254399972677
In [13]:
timeit.timeit(stmt='lst[10**6-1]',
              setup='import random; lst=[0] * 10**6')
Out[13]:
0.032995364000271366
In [14]:
import random
size = 10**3
times = [0] * size
lst   = [0] * size
for _ in range(100):
    for i in range(size):
        times[i] += timeit.timeit(stmt='lst[{}]'.format(i),
                                  globals=globals(),
                                  number=10)
In [15]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.plot(times, 'ro')
plt.show()

Accessing an element in a list by index always takes the same amount of time, regardless of position. I.e., indexing incurs a constant time delay.

How? A Python list uses an array as its underlying data storage mechanism. To access an element in an array, the interpreter:

  1. Computes an offset into the array by multiplying the element's index by the size of each array entry (which are uniformly sized, since they are merely references to the actual elements)
  2. Adds the offset to the base address of the array

Task: to locate an element with a given value in a list (array).

In [16]:
def index(lst, x):
    for i in range(len(lst)):
        if lst[i] == x:
            return i
    return None
In [17]:
lst = list(range(100))
In [18]:
index(lst, 10)
Out[18]:
10
In [19]:
index(lst, 99)
Out[19]:
99
In [20]:
index(lst, -1)
In [21]:
def index(lst, x):
    for i in range(len(lst)):
        if lst[i] == x:
            return i
    raise ValueError(x)
In [22]:
index(lst, 10)
Out[22]:
10
In [23]:
index(lst, -1)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-23-58f872d52287> in <module>()
----> 1 index(lst, -1)

<ipython-input-21-1cf466be2781> in index(lst, x)
      3         if lst[i] == x:
      4             return i
----> 5     raise ValueError(x)

ValueError: -1
In [24]:
try:
    print('Value found at', index(lst, -1))
except ValueError as e:
    print('Value not found:', e)
Value not found: -1
In [25]:
import timeit
times = []
lst = list(range(1000))
for x in lst:
    times.append(timeit.timeit(stmt='index(lst, {})'.format(x),
                               globals=globals(),
                               number=100))
In [26]:
import matplotlib.pyplot as plt
plt.plot(times, 'ro')
plt.show()
In [27]:
def index(lst, x):
    # assume that lst is sorted!!!
    low = 0
    hi  = len(lst)
    mid = (low + hi) // 2
    while lst[mid] != x and low <= hi:
        if lst[mid] < x:
            low = mid + 1
        else:
            hi  = mid - 1
        mid = (low + hi) // 2
    if lst[mid] == x:
        return mid
    else:
        raise ValueError(x)
In [28]:
lst = list(range(1000))
index(lst, 10)
Out[28]:
10
In [29]:
index(lst, 999)
Out[29]:
999
In [30]:
index(lst, -1)
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-30-58f872d52287> in <module>()
----> 1 index(lst, -1)

<ipython-input-27-017ffb928b46> in index(lst, x)
     13         return mid
     14     else:
---> 15         raise ValueError(x)

ValueError: -1
In [31]:
for i in range(len(lst)):
    assert(i == index(lst, i))
In [32]:
import timeit
times = []
lst = list(range(1000))
for x in lst:
    times.append(timeit.timeit(stmt='index(lst, {})'.format(x),
                               globals=globals(),
                               number=1000))
In [33]:
import matplotlib.pyplot as plt
plt.plot(times, 'ro')
plt.show()
In [34]:
def index(lst, x):
    # assume that lst is sorted!!!
    low = 0
    hi  = len(lst)
    mid = (low + hi) // 2
    while lst[mid] != x and low <= hi:
        if lst[mid] < x:
            low = mid + 1
        else:
            hi  = mid - 1
        mid = (low + hi) // 2
    if lst[mid] == x:
        return mid
    else:
        return None # for testing!
In [35]:
import timeit
import random
times = []
for size in range(100, 10000, 100):
    lst = list(range(size))
    times.append(timeit.timeit(stmt='index(lst, -1)'.format(random.randrange(size)),
                               globals=globals(),
                               number=10000))
In [36]:
import matplotlib.pyplot as plt
plt.plot(times, 'ro')
plt.show()
In [37]:
import timeit
import random
times = []
for e in range(5, 20):
    lst = list(range(2**e))
    times.append(timeit.timeit(stmt='index(lst, -1)',
                               globals=globals(),
                               number=100000))
In [38]:
import matplotlib.pyplot as plt
plt.plot(times, 'ro')
plt.show()

5. Insertion sort

In [39]:
import random
lst = list(range(1000))
random.shuffle(lst)
In [40]:
plt.plot(lst, 'ro')
plt.show()
In [41]:
def insertion_sort(lst):
    for i in range(1, len(lst)):
        for j in range(i, 0, -1):
            if lst[j-1] > lst[j]:
                lst[j-1], lst[j] = lst[j], lst[j-1] # swap
            else:
                break            
In [42]:
insertion_sort(lst)
In [43]:
plt.plot(lst, 'ro')
plt.show()
In [44]:
import timeit
import random
times = []
for size in range(100, 5000, 100):
    lst = list(range(size))
    times.append(timeit.timeit(stmt='insertion_sort(lst)',
                               setup='random.shuffle(lst)',
                               globals=globals(),
                               number=1))
In [45]:
plt.plot(times, 'ro')
plt.show()
In [ ]: