vipsprofile speed problems fixed

moved to O(n) layout
This commit is contained in:
John Cupitt 2013-12-18 11:10:13 +00:00
parent 2c5ee332f0
commit a9f85e1fd9
2 changed files with 81 additions and 115 deletions

24
TODO
View File

@ -3,30 +3,6 @@
- vipsprofile reports a leak, strangely
- vipsprofile performance is very poor for large data sets, eg.
time vips sharpen wtc.jpg x.jpg --radius 20 --vips-profile
recording profile in vips-profile.txt
real 0m14.728s
user 0m55.515s
sys 0m0.200s
john@bambam ~/pics $ vipsprofile
reading from vips-profile.txt
loaded 157716 events
last time = 14.584175
name alive wait% work% unkn% memory peakm
worker 20 14 2.22 95.8 2 22.7 22.7
worker 21 14 2.67 95.4 1.93 8.5 8.5
worker 22 14 2.95 95.2 1.84 17.8 17.8
worker 23 14 2.44 95.5 2.1 11.4 11.4
wbuffer 24 15 96 4.02 0.000654 0 0
wbuffer 25 15 95.4 4.62 0.000696 0 0
main 26 15 99.1 0 0.923 -37.4 6.87
peak memory = 67.3 MB
leak! final memory = 23 MB
positioning work/wait/mem ...
0% complete
- vipsprofile needs a man page for Debian, I guess
- new_heart.ws fails with libvips master

View File

@ -48,6 +48,9 @@ class Thread:
self.thread_name = thread_name
self.thread_number = Thread.thread_number
self.events = []
self.workwait = []
self.memory = []
self.other = []
Thread.thread_number += 1
all_events = []
@ -78,6 +81,12 @@ class Event:
thread.events.append(self)
all_events.append(self)
if self.wait or self.work:
thread.workwait.append(self)
elif self.memory:
thread.memory.append(self)
else:
thread.other.append(self)
input_filename = 'vips-profile.txt'
@ -134,6 +143,9 @@ with ReadFile(input_filename) as rf:
for thread in threads:
thread.events.sort(lambda x, y: cmp(x.start, y.start))
thread.workwait.sort(lambda x, y: cmp(x.start, y.start))
thread.memory.sort(lambda x, y: cmp(x.start, y.start))
thread.other.sort(lambda x, y: cmp(x.start, y.start))
all_events.sort(lambda x, y: cmp(x.start, y.start))
@ -141,17 +153,15 @@ print 'loaded %d events' % n_events
# move time axis to secs of computation
ticks_per_sec = 1000000.0
first_time = threads[0].events[0].start
first_time = all_events[0].start
last_time = 0
for thread in threads:
for event in thread.events:
for event in all_events:
if event.start < first_time:
first_time = event.start
if event.stop > last_time:
last_time = event.stop
for thread in threads:
for event in thread.events:
for event in all_events:
event.start = (event.start - first_time) / ticks_per_sec
event.stop = (event.stop - first_time) / ticks_per_sec
@ -166,8 +176,8 @@ for thread in threads:
thread.stop = 0
thread.wait = 0
thread.work = 0
thread.memory = 0
thread.peak_memory = 0
thread.mem = 0
thread.peak_mem = 0
for event in thread.events:
if event.start < thread.start:
thread.start = event.start
@ -178,9 +188,9 @@ for thread in threads:
if event.work:
thread.work += event.stop - event.start
if event.memory:
thread.memory += event.size
if thread.memory > thread.peak_memory:
thread.peak_memory = thread.memory
thread.mem += event.size
if thread.mem > thread.peak_mem:
thread.peak_mem = thread.mem
thread.alive = thread.stop - thread.start
@ -198,48 +208,47 @@ for thread in threads:
print '%13s\t%6.2g\t' % (thread.thread_name, thread.alive),
print '%.3g\t%.3g\t%.3g\t' % (wait_percent, work_percent, unkn_percent),
print '%.3g\t' % (float(thread.memory) / (1024 * 1024)),
print '%.3g\t' % (float(thread.peak_memory) / (1024 * 1024))
print '%.3g\t' % (float(thread.mem) / (1024 * 1024)),
print '%.3g\t' % (float(thread.peak_mem) / (1024 * 1024))
memory = 0
peak_memory = 0
mem = 0
peak_mem = 0
for event in all_events:
if event.memory:
memory += event.size
if memory > peak_memory:
peak_memory = memory
mem += event.size
if mem > peak_mem:
peak_mem = mem
print 'peak memory = %.3g MB' % (float(peak_memory) / (1024 * 1024))
if memory != 0:
print 'leak! final memory = %.3g MB' % (float(memory) / (1024 * 1024))
print 'peak memory = %.3g MB' % (float(peak_mem) / (1024 * 1024))
if mem != 0:
print 'leak! final memory = %.3g MB' % (float(mem) / (1024 * 1024))
# do two gates overlap?
def is_overlap(events, gate_name1, gate_name2):
for event1 in events:
if event1.gate_name != gate_name1:
# does a list of events contain an overlap?
# assume the list of events has been sorted by start time
def events_overlap(events):
for i in range(0, len(events) - 1):
# a length 0 event can't overlap with anything
if events[i].stop - events[i].start == 0:
continue
for event2 in events:
if event2.gate_name != gate_name2:
if events[i + 1].stop - events[i + 1].start == 0:
continue
# events are sorted by start time, so if we've gone past event1's
# stop time, we can give up
if event2.start > event1.stop:
break
# ... or if we're before event1's start
if event2.stop < event1.start:
continue
# if either endpoint of 1 is within 2
if event1.start > event2.start and event1.stop < event2.stop:
return True
if event1.stop > event2.start and event1.stop < event2.stop:
if events[i].stop > events[i + 1].start:
return True
return False
# do the events on two gates overlap?
def gates_overlap(events, gate_name1, gate_name2):
merged = []
for event in events:
if event.gate_name == gate_name1 or event.gate_name == gate_name2:
merged.append(event)
merged.sort(lambda x, y: cmp(x.start, y.start))
return events_overlap(merged)
# allocate a y position for each gate
total_y = 0
for thread in threads:
@ -248,49 +257,34 @@ for thread in threads:
thread.total_y = total_y
n_thread_events = len(thread.events)
if n_thread_events == 0:
continue
gate_positions = {}
# first pass .. move work and wait events to y == 0
print 'positioning work/wait/mem ...'
i = 0
gate_positions = {}
for event in thread.events:
i += 1
if i % (1 + n_thread_events / 100) == 0:
print '%d%% complete \r' % (100 * i / n_thread_events),
if not event.work and not event.wait and not event.memory:
continue
# works and waits must not overlap
if event.work or event.wait:
if not event.gate_name in gate_positions:
for gate_name in gate_positions:
if is_overlap(thread.events, event.gate_name, gate_name):
if events_overlap(thread.workwait):
print 'gate overlap on thread', thread.thread_name
print '\t', event.gate_location
print '\t', event.gate_name
print '\t', gate_name
break
for i in range(0, len(thread.workwait) - 1):
event1 = thread.workwait[i]
event2 = thread.workwait[i + 1]
if event1.stop > event2.start:
print 'overlap:'
print 'event', event1.gate_location, event1.gate_name,
print 'starts at', event1.start, 'stops at', event1.stop
print 'event', event2.gate_location, event2.gate_name,
print 'starts at', event2.start, 'stops at', event2.stop
for event in thread.workwait:
gate_positions[event.gate_name] = 0
event.y = 0
event.total_y = total_y
for event in thread.memory:
gate_positions[event.gate_name] = 0
event.y = 0
event.total_y = total_y
# second pass: move all other events to non-overlapping ys
print 'finding maximal sets of non-overlapping gates ...'
y = 1
i = 0
for event in thread.events:
i += 1
if i % (1 + n_thread_events / 100) == 0:
print '%d%% complete \r' % (100 * i / n_thread_events),
if event.work or event.wait or event.memory:
continue
for event in thread.other:
if not event.gate_name in gate_positions:
# look at all the ys we've allocated previously and see if we can
# add this gate to one of them
@ -300,7 +294,7 @@ for thread in threads:
if gate_positions[gate_name] != gate_y:
continue
if is_overlap(thread.events, event.gate_name, gate_name):
if gates_overlap(thread.other, event.gate_name, gate_name):
found_overlap = True
break
@ -317,11 +311,7 @@ for thread in threads:
# third pass: flip the order of the ys to get the lowest-level ones at the
# top, next to the wait/work line
print 'ordering timelines by granularity ...'
for event in thread.events:
if event.work or event.wait or event.memory:
continue
for event in thread.other:
event.y = y - event.y
event.total_y = total_y + event.y
@ -405,15 +395,15 @@ ctx.move_to(0, memory_y + theight + 8)
ctx.set_source_rgb(1.00, 1.00, 1.00)
ctx.show_text(label)
memory = 0
mem = 0
ctx.move_to(LEFT_BORDER, memory_y + MEM_HEIGHT)
for event in all_events:
if event.memory:
memory += event.size
mem += event.size
left = LEFT_BORDER + event.start * PIXELS_PER_SECOND
top = memory_y + MEM_HEIGHT - (MEM_HEIGHT * memory / peak_memory)
top = memory_y + MEM_HEIGHT - (MEM_HEIGHT * mem / peak_mem)
ctx.line_to(left, top)