follow-ups to the variance question

2013.Apr.24

In one part of yesterday’s interview question, we used sum and len.

Here’s some more information about these useful functions from __builtins__:

# sum: can sum any Iterable of non-string elements
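#   (strings are rejected explicitly: sum(['a','b'], '') raises a TypeError
#    telling you to use ''.join instead)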

# including lists, where list summation is concatenation
assert sum([[1], [2], [3]], []) == [1,2,3]

# including complex objects like timedeltas
from datetime import timedelta
assert sum([timedelta(days=2), timedelta(days=3)], timedelta(days=0)) == timedelta(days=5)

# including user-defined classes with __{,r,i}add__ implemented
class Foo(object):
	def __init__(self, value):
		self.value = value
	def __add__(self, other):
		return type(self)(self.value + other.value)
	def __eq__(self, other):
		return self.value == other.value

class Bar(object):
	def __init__(self, value):
		self.value = value
	def __add__(self, other):
		return type(self)(self.value + other.value)
	def __eq__(self, other):
		return self.value == other.value

# including heterogeneous Iterables
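#   (each __add__ above only looks at other.value, so mixing works; the
#    running total keeps the type of the left operand, i.e. the start value)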
assert sum([Foo(1), Bar(2), Bar(3)], Foo(0)) == Foo(6)
assert sum([Foo(1), Bar(2), Bar(3)], Bar(0)) == Bar(6)
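
# aside: sum() always starts from its start argument (0 by default), so
#   without one the first step is 0 + Foo(1), which calls Foo.__radd__,
#   as the __{,r,i}add__ comment above hints
# a hypothetical variant that handles that 0 so the start can be omitted:
class Summable(Foo):
	def __radd__(self, other):
		return self if other == 0 else NotImplemented

assert sum([Summable(1), Summable(2), Summable(3)]) == Foo(6)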

# len: works on anything that has __len__ defined
class Foo(object):
	def __init__(self, value):
		self.value = value
	def __len__(self):
		return len(self.value)

assert len(Foo([1,2,3])) == 3

In the above example for sum, why did I write type(self)(self.value + other.value) in __add__?

class Base(object):
	def __init__(self, value):
		self.value = value
	def __add__(self, other):
		return Base(self.value + other.value)

class Derived(Base):
	pass

assert type(Derived(1) + Derived(2)) is Base # is this our desired behaviour?

class Base(object):
	def __init__(self, value):
		self.value = value
	def __add__(self, other):
		return type(self)(self.value + other.value)

class Derived(Base):
	pass

assert type(Derived(1) + Derived(2)) is Derived 

# Base(self.value + other.value) means:
#   "explicitly return a new Base object with the summed values"
# type(self)(self.value + other.value) means:
#   "return the summed values wrapped in whatever type they came in"

When talking about list comprehensions, I asserted that they grow linearly in size (O(N) memory use):

from __future__ import division

# let's try to show that this is O(N) memory by
#   showing that increasing the number of elements by 1 increases
#   the size by a constant amount
# this demonstration is a bit long, because Python lists overallocate 

# count: the number of elements in the list
# size: the amount of memory the list uses

# calculate the count and size of a bunch of list comprehensions
#   of increasing count
from sys import getsizeof
sizes = [(len([x**2 for x in xrange(m) if x%2]),
          getsizeof([x**2 for x in xrange(m) if x%2])) for m in xrange(3,1000,2)]

# helper function to get pairs of values: [1,2,3,4] -> [(1,2), (2,3), (3,4)]
from itertools import izip, tee, islice
pairwise = lambda xs: izip(*(islice(x,i,None) for i,x in enumerate(tee(xs,2))))

# calculate the increases in list size per increase in list count
# this is the first derivative of size with respect to count
# without overallocation, size would be a straight line in count,
#   and every one of these increases would be the same constant
increases = [(new_count-old_count, new_size-old_size)
             for (old_count,old_size),(new_count,new_size) in pairwise(sizes)]

# collapse the above to figure out where we are allocating
def collapse(xs):
	total_count_delta, total_size_delta = 0, 0
	for count_delta, size_delta in xs: 
		total_count_delta += count_delta
		total_size_delta += size_delta
		if size_delta != 0:
			yield total_count_delta, total_size_delta	
			total_count_delta, total_size_delta = 0, 0
allocations = list(collapse(increases))

# find the second derivative to try to ignore the effects of allocation
# O(N) refers to the first derivative of how the size of the list
#   changes with increasing count
# therefore, the second derivative should be a constant
second_deriv = [(new_size_delta-old_size_delta)/(new_count_delta-old_count_delta)
                  if (new_count_delta-old_count_delta) else 0
                for (old_count_delta,old_size_delta),(new_count_delta,new_size_delta)
                in pairwise(allocations)]

# this should average out to around the cost of a single int reference (PyObject*)
from sys import maxsize
from math import log
# guess the pointer size by:
#   1. start with the largest representable integer (sys.maxsize)
#   2. log-base-2 + 1 for the number of bits needed to represent it
#      (nb: this quietly leans on float rounding: 2**63-1 rounds up to
#       2**63 as a float, so the result is 64 bits, where exact
#       arithmetic would give 63 and a too-small answer of 7 bytes)
#   3. divide by 8 to find number of bytes
pointer_size = (int(log(maxsize,2)) + 1)//8

# on my machine, running 64-bit Python:
#   assert pointer_size == 8 
assert round(sum(second_deriv)/len(second_deriv),0) == pointer_size
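
As a quicker sanity check (not part of the demonstration above), you can watch getsizeof jump as a list is appended to; each jump is an overallocation event:

# watch the capacity jumps directly (assuming a 64-bit build, where
#   each slot in the list is an 8-byte PyObject*)
from sys import getsizeof
xs, seen = [], [getsizeof([])]
for i in xrange(100):
	xs.append(i)
	if getsizeof(xs) != seen[-1]:
		seen.append(getsizeof(xs))

# every jump is a whole number of 8-byte slots
assert all((b - a) % 8 == 0 for a, b in zip(seen, seen[1:]))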

For reference, CPython's list allocation (list_resize in Objects/listobject.c) works as follows, and is designed to give amortised-linear behaviour over a long run of appends:

    /* This over-allocates proportional to the list size, making room
     * for additional growth.  The over-allocation is mild, but is
     * enough to give linear-time amortized behavior over a long
     * sequence of appends() in the presence of a poorly-performing
     * system realloc().
     * The growth pattern is:  0, 4, 8, 16, 25, 35, 46, 58, 72, 88, ...
     */
    new_allocated = (newsize >> 3) + (newsize < 9 ? 3 : 6);
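
Note the quoted line only computes the padding; in the full list_resize, new_allocated is then added to newsize itself. A quick Python sketch of that rule reproduces the growth pattern from the comment:

# simulate CPython's growth rule: padding plus the requested size
def grow(newsize):
	return newsize + (newsize >> 3) + (3 if newsize < 9 else 6)

def capacities(n):
	# append n elements, recording the capacity at each reallocation
	allocated, caps = 0, [0]
	for newsize in xrange(1, n + 1):
		if newsize > allocated:  # list is full: over-allocate
			allocated = grow(newsize)
			caps.append(allocated)
	return caps

assert capacities(80) == [0, 4, 8, 16, 25, 35, 46, 58, 72, 88]

The overshoot is proportional to the size (about an eighth of it), which is what makes the long run of appends amortised linear.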

This is much easier to show with generator expressions, which use constant memory:

# we can show this is constant in size very easily:
from sys import getsizeof
sizes = [getsizeof(x**2 for x in xrange(m) if x % 2) for m in xrange(3,1000,2)]

from itertools import izip, tee, islice
pairwise = lambda xs: izip(*(islice(x,i,None) for i,x in enumerate(tee(xs,2))))

assert all(x == y for x,y in pairwise(sizes)) # all the same size!
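
And to bring it back to yesterday's question: because generator expressions are constant-memory, feeding one to sum computes a running total without ever materialising the intermediate list. A sketch, assuming yesterday's question asked for the population variance:

from __future__ import division

def variance(xs):
	mean = sum(xs) / len(xs)
	# the genexp keeps this second pass O(1) in extra memory
	return sum((x - mean)**2 for x in xs) / len(xs)

assert variance([1, 1, 1]) == 0
assert variance([1, 2, 3, 4]) == 1.25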