diff --git a/ChangeLog b/ChangeLog
index 48f79014..389fc632 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -24,7 +24,7 @@
 - add --strip option to jpegsave
 - added vips_gaussblur() convenience function
 - added --vips-profile, records and dumps thread timing info 
-- added vipsprofile.py, visualises --vips-profile output
+- added vipsprofile, visualises --vips-profile output
 
 20/11/13	started 7.36.5
 - better cache sizing in unbuffered sequential mode
diff --git a/TODO b/TODO
index ae3d2f13..3b339cb2 100644
--- a/TODO
+++ b/TODO
@@ -4,34 +4,6 @@
 
   investigate
 
-- parse and graph vips-profile data 
-
-- we're spending a lot of time on threading
-
-	$ time vips sharpen x.v x2.v
-	real	0m6.925s
-	user	0m18.240s
-	sys	0m0.996s
-	$ time vips sharpen x.v x2.v --vips-concurrency=1
-	real	0m12.831s
-	user	0m10.016s
-	sys	0m0.480s
-
-  almost 50% of CPU! ouch
-
-  could be just overcalculation on overlaps? do we get more overcalculation
-  with more threads?
-
-  valgrind finds no interesting functions in the profile for vips sharpen,
-  just 
-
-  	Lab2XYZ			20% 
-	conv_gen		15% 
-	extract_band		8%
-	etc. 
-
-  nor any difference between 1 thread and n thread runs
-
 - vips_gaussblur() should switch to float prec if given a float image?
 
   same for vips_conv()?
diff --git a/tools/vipsprofile b/tools/vipsprofile
index 3192243a..94f92d5c 100755
--- a/tools/vipsprofile
+++ b/tools/vipsprofile
@@ -136,11 +136,12 @@ for thread in threads:
 print 'last time =', last_time
 
 # calculate some simple stats
-print 'name\t\t\talive\twait\twait%'
+print 'name\t\t\t\talive\twait%\twork%\tunk%'
 for thread in threads:
     start = last_time
     stop = 0
     wait = 0
+    work = 0
     for event in thread.events:
         if event.start < start:
             start = event.start
@@ -148,9 +149,14 @@ for thread in threads:
             stop = event.stop
         if event.wait:
             wait += event.stop - event.start
+        if event.work:
+            work += event.stop - event.start
 
     alive = stop - start
-    print thread.thread_name, alive, wait, 100 * wait / alive
+    wait_percent = 100 * wait / alive
+    work_percent = 100 * work / alive
+    unkn_percent = 100 - (wait_percent + work_percent)
+    print '%30s\t%6.2g\t%.3g\t%.3g\t%.3g' % (thread.thread_name, alive, wait_percent, work_percent, unkn_percent)
 
 # do two gates overlap?
 def is_overlap(events, gate_name1, gate_name2):
@@ -175,12 +181,31 @@ total_y = 0
 for thread in threads:
     thread.total_y = total_y
 
-    y = 1
+    # first pass .. move work and wait events to y == 0
     gate_positions = {}
     for event in thread.events:
-        if event.work or event.wait:
+        if not event.work and not event.wait:
+            continue
+
+        # no works and waits must overlap
+        if not event.gate_name in gate_positions:
+            for gate_name in gate_positions:
+                if is_overlap(thread.events, event.gate_name, gate_name):
+                    print 'gate', event.gate_name, 'and', gate_name, 'overlap'
+                    break
+
             gate_positions[event.gate_name] = 0
-        elif not event.gate_name in gate_positions:
+
+        event.y = gate_positions[event.gate_name] 
+        event.total_y = total_y + event.y
+
+    # second pass: move all other events to non-overlapping ys
+    y = 1
+    for event in thread.events:
+        if event.work or event.wait:
+            continue
+
+        if not event.gate_name in gate_positions:
             no_overlap = False
             for gate_name in gate_positions:
                 if not is_overlap(thread.events, gate_name, event.gate_name):