libvips/tools/vipsprofile
John Cupitt 2c5ee332f0 make the buffer recycle list per image
so now recycle lists are short, scale with pipeline complexity, and
buffers are always appropriately sized for the image instead of being
slowly sized up to the max size for the pipeline

before:

$ vips sharpen k2.jpg x.jpg --radius 20
memory: high-water mark 38.99 MB

after:

$ vips sharpen k2.jpg x.jpg --radius 20
memory: high-water mark 29.46 MB
2013-12-18 09:54:26 +00:00

453 lines
13 KiB
Python

#!/usr/bin/python
import math
import re
import sys

import cairo
class ReadFile:
    """Read a text file one line at a time, with one line of lookahead.

    Use as a context manager: entering opens the file and loads the first
    line into ``self.line``; ``getnext()`` advances to the following line.
    ``self.lineno`` is the 1-based number of the line held in ``self.line``.

    The object is truthy while there is a current line and falsy once the
    end of the file has been reached (``readline()`` returns "" only at EOF;
    blank lines still carry their newline).
    """

    def __init__(self, filename):
        self.filename = filename
        # defined up front so truth-testing an unopened reader is well-defined
        self.f = None
        self.lineno = 0
        self.line = ""

    def __enter__(self):
        self.f = open(self.filename, 'r')
        self.lineno = 0
        try:
            # prime the lookahead with the first line
            self.getnext()
        except Exception:
            # __exit__ is not called when __enter__ fails, so close here
            self.f.close()
            raise
        return self

    def __exit__(self, type, value, traceback):
        self.f.close()

    def __nonzero__(self):
        return self.line != ""

    # Python 3 looks up __bool__ rather than __nonzero__; without this alias
    # the reader is always truthy and "while rf:" never terminates
    __bool__ = __nonzero__

    def getnext(self):
        """Advance to the next line of the file."""
        self.lineno += 1
        self.line = self.f.readline()
def read_times(rf):
    """Consume the run of timestamp lines at the reader's current position.

    A timestamp line starts with an optionally signed integer followed by a
    space. Every such line is split on single spaces and converted to ints;
    reading stops at the first line that does not look like that, which is
    left as the reader's current line.

    Returns the collected integers in reverse order of appearance.
    """
    collected = []
    while re.match('[+-]?[0-9]+ ', rf.line):
        for field in rf.line.rstrip().split(' '):
            collected.append(int(field))
        rf.getnext()
    collected.reverse()
    return collected
class Thread:
    """One thread from the profile, together with the events recorded on it.

    Attributes:
        thread_name: display name, with any "(0x...)" address removed.
        thread_number: sequence number of this thread, in creation order.
        events: list that Event objects append themselves to.
    """

    # class-level counter: the number the next Thread created will receive
    thread_number = 0

    def __init__(self, thread_name):
        # no one cares about the thread address
        # (raw string: "\(" is not a valid escape in an ordinary literal)
        match = re.match(r'(.*) \(0x.*?\) (.*)', thread_name)
        if match:
            thread_name = match.group(1) + " " + match.group(2)
        self.thread_name = thread_name

        # the instance attribute shadows the class counter for this object
        self.thread_number = Thread.thread_number
        self.events = []
        Thread.thread_number += 1
# every Event created, across all threads, in creation order
all_events = []


class Event:
    """A single start/stop record for one gate on one thread.

    Each event is classified as exactly one of memory, work, wait, or none
    of them (a plain gate):

    * memory -- gate_location is "memory"; such events are instantaneous
      (stop is set equal to start) and the value passed as ``stop`` is
      stored as ``size`` instead.
    * work / wait -- gate_name contains "work" / "wait" (work wins if both
      appear).

    Creating an Event appends it to ``thread.events`` and to the module
    level ``all_events`` list.
    """

    def __init__(self, thread, gate_location, gate_name, start, stop):
        self.thread = thread
        self.gate_location = gate_location
        self.gate_name = gate_name

        # gate_name is None when the gate line had no ": name" suffix; treat
        # that as an empty label rather than handing None to a string test
        label = gate_name or ""

        self.memory = gate_location == "memory"
        self.work = not self.memory and "work" in label
        self.wait = not (self.memory or self.work) and "wait" in label

        self.start = start
        if self.memory:
            # memory records carry a size in the second field, not a time
            self.stop = start
            self.size = stop
        else:
            self.stop = stop
            # defined for every event so callers need not test the kind first
            self.size = 0

        thread.events.append(self)
        all_events.append(self)
# Parse the profile dump into Thread and Event objects.
#
# The file is a sequence of "thread: NAME" sections, each followed by zero
# or more gates of the form
#
#   gate: LOCATION[: NAME]
#   start:
#   <lines of integers>
#   stop:
#   <lines of integers>
#
# Blank lines and lines starting with '#' are ignored between sections.
input_filename = 'vips-profile.txt'

thread_id = 0
threads = []
n_events = 0
print('reading from %s' % input_filename)
with ReadFile(input_filename) as rf:
    # rf.line is "" only at end of file (blank lines keep their newline);
    # testing it directly does not depend on how ReadFile defines truthiness
    while rf.line != "":
        # skip blank lines and comments
        if rf.line.rstrip() == "" or rf.line[0] == "#":
            rf.getnext()
            continue

        match = re.match('thread: (.*)', rf.line)
        if not match:
            print('parse error line %d, expected "thread"' % rf.lineno)
            # there is no match to read a name from: drop the offending line
            # and try to pick the parse up again at the next one
            rf.getnext()
            continue

        # the running id keeps names unique even if the dump repeats a name
        thread_name = match.group(1) + " " + str(thread_id)
        thread_id += 1
        thread = Thread(thread_name)
        threads.append(thread)
        rf.getnext()

        while True:
            match = re.match('^gate: (.*?)(: (.*))?$', rf.line)
            if not match:
                break
            gate_location = match.group(1)
            # None when the gate line has no ": NAME" part
            gate_name = match.group(3)
            rf.getnext()

            # a gate with no start/stop section is skipped; the current line
            # is then re-examined as a possible next gate
            if not re.match('start:', rf.line):
                continue
            rf.getnext()
            start = read_times(rf)

            if not re.match('stop:', rf.line):
                continue
            rf.getnext()
            stop = read_times(rf)

            if len(start) != len(stop):
                print('start and stop length mismatch')

            # zip() pairs up to the shorter list, so surplus entries on a
            # mismatch are dropped
            for a, b in zip(start, stop):
                Event(thread, gate_location, gate_name, a, b)
                n_events += 1

# order everything by start time; key= works on Python 2 and 3 alike and the
# sort is stable, so events with equal starts keep their file order
for thread in threads:
    thread.events.sort(key=lambda e: e.start)

all_events.sort(key=lambda e: e.start)

print('loaded %d events' % n_events)
# move time axis to secs of computation
ticks_per_sec = 1000000.0

if not all_events:
    # nothing to rebase or draw; stop with a message instead of failing on
    # an index into an empty list
    raise SystemExit('no events found in %s' % input_filename)

# earliest start and latest stop over every event on every thread (the
# latest stop is never taken to be below zero)
first_time = min(event.start for event in all_events)
last_time = max(0, max(event.stop for event in all_events))

# rebase so the first event starts at 0, and convert ticks to seconds;
# all_events holds the same objects as the per-thread lists
for event in all_events:
    event.start = (event.start - first_time) / ticks_per_sec
    event.stop = (event.stop - first_time) / ticks_per_sec

last_time = (last_time - first_time) / ticks_per_sec
first_time = 0

print('last time = %s' % last_time)
# calculate some simple stats
#
# For each thread: first/last event time, total seconds spent in wait and
# work events, the running sum of memory event sizes and its high-water
# mark, and a "hide" flag used to drop it from the report and the drawing.
for thread in threads:
    thread.start = last_time
    thread.stop = 0
    thread.wait = 0
    thread.work = 0
    thread.memory = 0
    thread.peak_memory = 0

    for event in thread.events:
        if event.start < thread.start:
            thread.start = event.start
        if event.stop > thread.stop:
            thread.stop = event.stop

        duration = event.stop - event.start
        if event.wait:
            thread.wait += duration
        if event.work:
            thread.work += duration
        if event.memory:
            thread.memory += event.size
            if thread.memory > thread.peak_memory:
                thread.peak_memory = thread.memory

    # a thread with no events gets a non-positive lifetime here and so is
    # hidden as well
    thread.alive = thread.stop - thread.start

    # hide very short-lived threads
    thread.hide = thread.alive < 0.01

print('name\t\talive\twait%\twork%\tunkn%\tmemory\tpeakm')
for thread in threads:
    if thread.hide:
        continue

    # alive is at least 0.01 for any thread that is not hidden, so these
    # divisions cannot be by zero
    wait_percent = 100 * thread.wait / thread.alive
    work_percent = 100 * thread.work / thread.alive
    unkn_percent = 100 - 100 * (thread.work + thread.wait) / thread.alive

    # build the whole row and print it once, rather than leaning on the
    # Python 2 print statement's trailing-comma behaviour
    row = '%13s\t%6.2g\t' % (thread.thread_name, thread.alive)
    row += '%.3g\t%.3g\t%.3g\t' % (wait_percent, work_percent, unkn_percent)
    row += '%.3g\t' % (float(thread.memory) / (1024 * 1024))
    row += '%.3g\t' % (float(thread.peak_memory) / (1024 * 1024))
    print(row)

# the same running total and high-water mark, over all threads together
memory = 0
peak_memory = 0
for event in all_events:
    if event.memory:
        memory += event.size
        if memory > peak_memory:
            peak_memory = memory

print('peak memory = %.3g MB' % (float(peak_memory) / (1024 * 1024)))
if memory != 0:
    print('leak! final memory = %.3g MB' % (float(memory) / (1024 * 1024)))
# do two gates overlap?
def is_overlap(events, gate_name1, gate_name2):
    """Return True if any event of gate 1 overlaps in time any event of gate 2.

    ``events`` must be sorted by start time; each item needs ``gate_name``,
    ``start`` and ``stop`` attributes. Two events overlap when their time
    ranges share more than a single point, whichever of them starts first
    and whether or not one contains the other. Events that merely touch at
    an endpoint do not count.
    """
    for event1 in events:
        if event1.gate_name != gate_name1:
            continue

        for event2 in events:
            if event2.gate_name != gate_name2:
                continue

            # events are sorted by start time, so if we've gone past event1's
            # stop time, we can give up
            if event2.start > event1.stop:
                break

            # ... or if we're before event1's start
            if event2.stop < event1.start:
                continue

            # Symmetric interval test. Checking only whether event1's stop
            # lies inside event2 misses event1 starting inside event2 and
            # running past its end, and event1 enclosing event2.
            if event1.start < event2.stop and event2.start < event1.stop:
                return True

    return False
# allocate a y position for each gate
#
# Every visible thread gets a band of rows starting at thread.total_y.
# Row 0 of the band holds the work/wait/memory events; each other gate name
# is given a row of its own or shares one with gates it does not overlap.
# Each event ends up with .y (row within the band) and .total_y (row within
# the whole drawing).
total_y = 0
for thread in threads:
    if thread.hide:
        continue

    thread.total_y = total_y

    n_thread_events = len(thread.events)
    if n_thread_events == 0:
        continue

    # report progress about once per percent of the events; floor division
    # keeps the step an integer on every Python version
    progress_step = 1 + n_thread_events // 100

    # first pass .. move work and wait events to y == 0
    print('positioning work/wait/mem ...')
    i = 0
    gate_positions = {}
    for event in thread.events:
        i += 1
        if i % progress_step == 0:
            # "\r" and no newline, so each update overwrites the last
            sys.stdout.write('%d%% complete \r' %
                             (100 * i // n_thread_events))

        if not event.work and not event.wait and not event.memory:
            continue

        # works and waits must not overlap
        if event.work or event.wait:
            if event.gate_name not in gate_positions:
                # an overlap is only reported, the gate still goes on row 0
                for gate_name in gate_positions:
                    if is_overlap(thread.events, event.gate_name, gate_name):
                        print('gate overlap on thread %s' % thread.thread_name)
                        print('\t%s' % event.gate_location)
                        print('\t%s' % event.gate_name)
                        print('\t%s' % gate_name)
                        break

                gate_positions[event.gate_name] = 0

        event.y = 0
        event.total_y = total_y

    # second pass: move all other events to non-overlapping ys
    print('finding maximal sets of non-overlapping gates ...')
    y = 1
    i = 0
    for event in thread.events:
        i += 1
        if i % progress_step == 0:
            sys.stdout.write('%d%% complete \r' %
                             (100 * i // n_thread_events))

        if event.work or event.wait or event.memory:
            continue

        if event.gate_name not in gate_positions:
            # look at all the ys we've allocated previously and see if we can
            # add this gate to one of them
            for gate_y in range(1, y):
                found_overlap = False
                for gate_name in gate_positions:
                    if gate_positions[gate_name] != gate_y:
                        continue

                    if is_overlap(thread.events, event.gate_name, gate_name):
                        found_overlap = True
                        break

                if not found_overlap:
                    gate_positions[event.gate_name] = gate_y
                    break

            # failure? add a new y
            if event.gate_name not in gate_positions:
                gate_positions[event.gate_name] = y
                y += 1

        event.y = gate_positions[event.gate_name]

    # third pass: flip the order of the ys to get the lowest-level ones at the
    # top, next to the wait/work line
    print('ordering timelines by granularity ...')
    for event in thread.events:
        if event.work or event.wait or event.memory:
            continue

        event.y = y - event.y
        event.total_y = total_y + event.y

    # the next thread's band starts below every row this thread used
    total_y += y
# drawing geometry, in pixels
PIXELS_PER_SECOND = 1000    # horizontal scale of the time axis
PIXELS_PER_GATE = 20        # height of one row
LEFT_BORDER = 130           # left margin, where the labels are drawn
BAR_HEIGHT = 5              # thickness of an event bar
MEM_HEIGHT = 100            # height of the memory plot under the rows

# canvas: margin plus the full time span wide; all rows plus the memory
# plot tall, each with a little padding
WIDTH = int(LEFT_BORDER + last_time * PIXELS_PER_SECOND) + 20
HEIGHT = int(total_y * PIXELS_PER_GATE) + MEM_HEIGHT + 30

output_filename = "vips-profile.svg"
print('writing to %s' % output_filename)

surface = cairo.SVGSurface(output_filename, WIDTH, HEIGHT)

ctx = cairo.Context(surface)
ctx.select_font_face('Sans')
ctx.set_font_size(15)

# fill the whole canvas with a dark blue background
ctx.rectangle(0, 0, WIDTH, HEIGHT)
ctx.set_source_rgba(0.0, 0.0, 0.3, 1.0)
ctx.fill()
def draw_event(ctx, event):
    """Draw one event as a filled bar on its row, labelled if a plain gate.

    ctx is the cairo context to draw on. event supplies start/stop (already
    in seconds), total_y (its row) and the wait/work/memory flags.

    Wait bars are red, work bars green, memory events a thin white tick
    placed below the bar line, and every other gate blue with its name
    centred underneath in yellow.
    """
    left = event.start * PIXELS_PER_SECOND + LEFT_BORDER
    # "//" gives the same whole-pixel offset that "/" gave for these ints on
    # Python 2, on every Python version
    top = event.total_y * PIXELS_PER_GATE + BAR_HEIGHT // 2
    width = (event.stop - event.start) * PIXELS_PER_SECOND
    height = BAR_HEIGHT

    if event.memory:
        # memory events have no duration: draw a 1-pixel-wide, half-height
        # tick one bar-height lower
        width = 1
        height //= 2
        top += BAR_HEIGHT

    ctx.rectangle(left, top, width, height)

    if event.wait:
        ctx.set_source_rgb(0.9, 0.1, 0.1)
    elif event.work:
        ctx.set_source_rgb(0.1, 0.9, 0.1)
    elif event.memory:
        ctx.set_source_rgb(1.0, 1.0, 1.0)
    else:
        ctx.set_source_rgb(0.1, 0.1, 0.9)

    ctx.fill()

    if not event.wait and not event.work and not event.memory:
        # gate_name is None for a gate that was recorded without a name;
        # fall back to its location so there is always a string to draw
        label = event.gate_name
        if label is None:
            label = event.gate_location

        xbearing, ybearing, twidth, theight, xadvance, yadvance = \
            ctx.text_extents(label)

        # centre the text horizontally on the bar
        ctx.move_to(left + width / 2 - twidth / 2, top + 3 * BAR_HEIGHT)
        ctx.set_source_rgb(1.00, 0.83, 0.00)
        ctx.show_text(label)
# one band per visible thread: a white rule across the canvas at the top of
# the band, the thread name at the left, then all of its events
for thread in threads:
    if thread.hide:
        continue

    ctx.rectangle(0, thread.total_y * PIXELS_PER_GATE, WIDTH, 1)
    ctx.set_source_rgb(1.00, 1.00, 1.00)
    ctx.fill()

    xbearing, ybearing, twidth, theight, xadvance, yadvance = \
        ctx.text_extents(thread.thread_name)
    # "//" keeps the whole-pixel offset "/" gave for these ints on Python 2
    ctx.move_to(0, theight + thread.total_y * PIXELS_PER_GATE +
                BAR_HEIGHT // 2)
    ctx.set_source_rgb(1.00, 1.00, 1.00)
    ctx.show_text(thread.thread_name)

    for event in thread.events:
        draw_event(ctx, event)

# memory plot: running total of memory event sizes over time, scaled so the
# overall peak reaches the top of a MEM_HEIGHT-pixel strip below the rows
memory_y = total_y * PIXELS_PER_GATE

label = "memory"
xbearing, ybearing, twidth, theight, xadvance, yadvance = \
    ctx.text_extents(label)
ctx.move_to(0, memory_y + theight + 8)
ctx.set_source_rgb(1.00, 1.00, 1.00)
ctx.show_text(label)

memory = 0
ctx.move_to(LEFT_BORDER, memory_y + MEM_HEIGHT)

for event in all_events:
    if event.memory:
        memory += event.size

        left = LEFT_BORDER + event.start * PIXELS_PER_SECOND
        # peak_memory stays 0 if the running total never rose above zero;
        # plot on the baseline then instead of dividing by zero
        if peak_memory > 0:
            bar = MEM_HEIGHT * memory // peak_memory
        else:
            bar = 0
        top = memory_y + MEM_HEIGHT - bar

        ctx.line_to(left, top)

ctx.set_line_width(1)
ctx.set_source_rgb(1.00, 1.00, 1.00)
ctx.stroke()

# time axis along the bottom edge of the memory plot
axis_y = total_y * PIXELS_PER_GATE + MEM_HEIGHT

ctx.rectangle(LEFT_BORDER, axis_y, last_time * PIXELS_PER_SECOND, 1)
ctx.set_source_rgb(1.00, 1.00, 1.00)
ctx.fill()

label = "time"
xbearing, ybearing, twidth, theight, xadvance, yadvance = \
    ctx.text_extents(label)
ctx.move_to(0, axis_y + theight + 8)
ctx.set_source_rgb(1.00, 1.00, 1.00)
ctx.show_text(label)

# a tick and a label every tenth of a second; range() needs an integer
# step, hence the floor division
for t in range(0, int(last_time * PIXELS_PER_SECOND), PIXELS_PER_SECOND // 10):
    left = t + LEFT_BORDER
    top = axis_y

    ctx.rectangle(left, top, 1, 5)
    ctx.set_source_rgb(1.00, 1.00, 1.00)
    ctx.fill()

    # t is in pixels; convert back to seconds for the label
    label = str(float(t) / PIXELS_PER_SECOND)
    xbearing, ybearing, twidth, theight, xadvance, yadvance = \
        ctx.text_extents(label)
    ctx.move_to(left - twidth / 2, top + theight + 8)
    ctx.set_source_rgb(1.00, 1.00, 1.00)
    ctx.show_text(label)

surface.finish()