# follow-ups to the variance question

## 2013.Apr.24

In one part of yesterday’s interview question, we used `sum` and `len`.

Here’s some more information about these useful functions from `__builtins__`:


# sum: can sum any Iterable of non-string elements
# including lists, where list summation is concatenation
assert sum([[1], [2], [3]], []) == [1, 2, 3]
# including complex objects like timedeltas
from datetime import timedelta
assert sum([timedelta(days=2), timedelta(days=3)], timedelta(days=0)) == timedelta(days=5)
# including user-defined classes with __{,r,i}add__ implemented
class Foo(object):
    # minimal value wrapper supporting + and ==
    def __init__(self, value):
        self.value = value
    def __add__(self, other):
        # type(self) keeps the result's type matching the left operand's
        return type(self)(self.value + other.value)
    def __eq__(self, other):
        return self.value == other.value
class Bar(object):
    # deliberately identical to Foo, to demonstrate mixing types in sum
    def __init__(self, value):
        self.value = value
    def __add__(self, other):
        return type(self)(self.value + other.value)
    def __eq__(self, other):
        return self.value == other.value
# including heterogeneous Iterables
assert sum([Foo(1), Bar(2), Bar(3)], Foo(0)) == Foo(6)
assert sum([Foo(1), Bar(2), Bar(3)], Bar(0)) == Bar(6)
# len: works on anything that has __len__ defined
class Foo(object):
    # wrapper that delegates length to its wrapped value
    def __init__(self, value):
        self.value = value
    def __len__(self):
        return len(self.value)
assert len(Foo([1,2,3])) == 3

In the above example for `sum`, why did I write `type(self)(self.value + other.value)` in `__add__`?


class Base(object):
    def __init__(self, value):
        self.value = value
    def __add__(self, other):
        # hard-codes the result type: even Derived + Derived yields a Base
        return Base(self.value + other.value)
class Derived(Base):
    pass
assert type(Derived(1) + Derived(2)) is Base # is this our desired behaviour?
class Base(object):
    def __init__(self, value):
        self.value = value
    def __add__(self, other):
        # result type follows the left operand's actual (possibly derived) type
        return type(self)(self.value + other.value)
class Derived(Base):
    pass
assert type(Derived(1) + Derived(2)) is Derived
# Base(self.value + other.value) means:
# "explicitly return a new Base object with the summed values"
# type(self)(self.value + other.value) means:
# "return the summed values wrapped in whatever type they came in"

When talking about list comprehensions, I asserted that they grow linearly in size (O(N) memory use):


from __future__ import division
# let's try to show that this is O(N) memory by
# showing that increasing the number of elements by 1 increases
# the size by a constant amount
# this demonstration is a bit long, because Python lists overallocate
# count: the number of elements in the list
# size: the amount of memory the list uses
# calculate the count and size of a bunch of list comprehensions
# of increasing count
from sys import getsizeof
sizes = [(len([x**2 for x in xrange(m) if x%2]),
          getsizeof([x**2 for x in xrange(m) if x%2])) for m in xrange(3,1000,2)]
# helper function to get pairs of values: [1,2,3,4] -> [(1,2), (2,3), (3,4)]
from itertools import izip, tee, islice
pairwise = lambda xs: izip(*(islice(x,i,None) for i,x in enumerate(tee(xs,2))))
# calculate the increases in list size per increase in list count
# this is the first derivative of size vs count
# without overallocation, size/count would be linear
increases = [(new_count-old_count, new_size-old_size)
             for (old_count,old_size),(new_count,new_size) in pairwise(sizes)]
# collapse the above to figure out where we are allocating
def collapse(xs):
    # accumulate (count, size) deltas, emitting one pair each time an
    # actual reallocation is observed (size_delta != 0), then resetting
    total_count_delta, total_size_delta = 0, 0
    for count_delta, size_delta in xs:
        total_count_delta += count_delta
        total_size_delta += size_delta
        if size_delta != 0:
            yield total_count_delta, total_size_delta
            total_count_delta, total_size_delta = 0, 0
allocations = list(collapse(increases))
# find the second derivative to try to ignore the effects of allocation
# O(N) refers to the first derivative of how the size of the list
# changes with increasing count
# therefore, the second derivative should be a constant
second_deriv = [(new_size_delta-old_size_delta)/(new_count_delta-old_count_delta)
                if (new_count_delta-old_count_delta) else 0
                for (old_count_delta,old_size_delta),(new_count_delta,new_size_delta)
                in pairwise(allocations)]
# this should average out to around the cost of a single int reference (PyObject*)
from sys import maxsize
from math import log
# guess the pointer size by:
# 1. start with the largest representable integer (sys.maxsize)
# 2. log-base-2 + 1 for the number of bits needed to represent it
# 3. divide by 8 to find number of bytes
pointer_size = (int(log(maxsize,2)) + 1)//8
# on my machine, running 64-bit Python:
# assert pointer_size == 8
assert round(sum(second_deriv)/len(second_deriv),0) == pointer_size

For reference, Python list allocation works as follows and should be amortised linear.


/* NOTE(review): excerpt quoted from CPython's list resize logic
 * (Objects/listobject.c); not compilable on its own — confirm the exact
 * growth formula against the CPython version you care about. */
/* This over-allocates proportional to the list size, making room
 * for additional growth. The over-allocation is mild, but is
 * enough to give linear-time amortized behavior over a long
 * sequence of appends() in the presence of a poorly-performing
 * system realloc().
 * The growth pattern is: 0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
 */
new_allocated = (newsize >> 3) + (newsize < 9 ? 3 : 6);

This is much easier to show with generator expressions, which use constant memory:


# we can show this is constant in size very easily:
from sys import getsizeof
from itertools import izip, tee, islice

# measure generator expressions built over increasingly long ranges;
# unlike list comprehensions, these never materialize their elements
sizes = [getsizeof(x**2 for x in xrange(m) if x % 2) for m in xrange(3, 1000, 2)]

def pairwise(xs):
    # consecutive pairs: [1,2,3,4] -> (1,2), (2,3), (3,4)
    left, right = tee(xs, 2)
    right = islice(right, 1, None)
    return izip(left, right)

assert all(x == y for x, y in pairwise(sizes)) # all the same size!