#!/bin/nawk -f - # # Accepts input lines of the form: # # job-name job-id { "start", "end" } timestamp # # So: ftp AA1234 start 829107402 # telnet AB767 start 829107422 # telnet AB767 end 829107477 # ftp AA1234 end 829108382 # etc.. # # DO NOT USE a '+' or a '$' in the job-name or job-id fields. The timestamps # should be in seconds (or any other simple integer). The end record for any # job must come AFTEER the start, but otherwise we don't care much about # order since we sort. # # It divides the jobs up by duration, according to the specified granularity. # That is, the granularity determines the length of a 'tick' in seconds, and # jobs that take the same number of ticks are lumped together. # # The job-name and the # of ticks it took determine a job's 'class', which # is stored as something like 'ftp+7' for 'ftp jobs that took 7 ticks'. # The start times for all the jobs in the world are stored away, and then # sorted, according to class. The intervals between all the start times are # checked, and used to determine a max and a min value for the possible # gaps between job-starts in a given class. This essentially simulates # the input load by assuming the job start times in the input are # UNIFORMLY distributed between the max and min interval. This is likely # to be false. Most times, say, you (in real life) start a job once or # twice a minute, but then there's a 2 hour gap by accident. This guy will # start jobs up at random intervals of 1 minute to 2 hours, based on # that input. Edit the output to whack the outliers, perhaps. Or feel # free to beef up the statistical modelling in this thing or workload(1) ;) # # But do see the comments in the BEGIN section for some variations. # # Then a parallel {} specification is emitted, with a bunch of clauses # each a 'random {}' specification for the job specified. The class name # with the + editted out, for example class 'ftp+7' is run as command # 'ftp 7', and the max and min times are, well, the max and min times to # use for sequencing job starts. # # To generate an Nx load, edit the 'parallel {' at the beginning to # 'parallel N {' # # To run the same synthetic job over and over again, wrap the whole file # up in 'sequential N {' and '} at the end. # BEGIN { granularity=30; seed="pid"; # seed for the 'random' specifications # For job classes with only one job, you can't compute any intervals # between jobs, so you just use these values as the max/min for # the random {} spec. maxtime=100; mintime=0; verbose=0; # A debug flag, really # If you want to use average start times, set these up.. # this computes the average interval between job starts, and # models the workload as uniformly distributed inter-job-start # gaps between [avg - delta , avg + delta] use_averages = 0; # If you're using averages, the average interval + or - this delta # will be the max and min times, not the actual observed max/min averages_delta = 3; # If you want to chop some range of the head and tail, set these # This model counts all the inter-job-start intervals for a job # class, and chops off the bottom (low_percentile)% and the top # (high_percentile)% from that list, and then models the workload # as uniformly distributed inter-job-start gaps between the bottom # and the top of the remaining list. Roughly, it chops off the # outlying observed inter-job-start intervals at the indicated # percentages. use_percentiles = 1; low_percentile = 45; high_percentile = 55; if(use_averages && use_percentiles) { print "ERROR: Can't use both averages and percentiles"; exit; } } $3 == "start" { start[$1 "+" $2] = $4; } $3 == "end" { if(start[$1 "+" $2] > 0) { duration = $4 - start[$1 "+" $2]; if(duration >= 0) { duration = int(duration / granularity); if(verbose) print "# Job " $1 "+" $2 " took " \ duration " ticks."; class = $1 "+" duration; store_job(class, start[$1 "+" $2]); } else { print "# Error at " NR " Job " $1 " " $2 " took " \ duration " seconds?"; } } else { print "# Error at line " NR " no start record for job " \ $1 " " $2; } } END { do_min_max(); print "parallel {"; for(c in minimums) { cmd = c; gsub(/\+/," ",cmd); if(next_id[c] == 2) { print "\t# Only one job in class " c; print "\trandom 1 " seed " " mintime " " maxtime " {" print "\t\tcommand \"" cmd "\""; print "\t}"; continue; } # We had multiple jobs in the class.. print "\t# " next_id[c]-1 " jobs in class " c; print "\trandom " next_id[c]-1 " " seed " " minimums[c] " " \ maximums[c] " {"; print "\t\tcommand \"" cmd "\""; print "\t}" } print "}" } # # Stores a specified start time into a job class # function store_job(c, s, id) { if(next_id[c] > 0) { id = next_id[c]; next_id[c] = id + 1; } else { id = 1; next_id[c] = 2; } starttimes[c "$" id] = s; classes[c] = 1; # It exists } # We have divided all the jobs input into classes, and stored the # start times for every job in every class. Walk the list of all # the starttimes, looking for the min and the max intervals between # start times. function do_min_max(c, i, mn, mx, total,lo, hi) { for(c in classes) { mn = 1000000; mx = 0; total = 0; quicksort(c, starttimes, 1, next_id[c]-1); for(i = 1; i < next_id[c]-1; i++) { interval = starttimes[c "$" (i+1)] - \ starttimes[c "$" i]; total += interval; if(interval < mn) { mn = interval; } if(interval > mx) { mx = interval; } if(use_percentiles) { intervals["X$" i] = interval; } } averages[c] = int(total / next_id[c]-1); if(use_averages) { minimums[c] = averages[c] - averages_delta; maximums[c] = averages[c] + averages_delta; } else if(use_percentiles && next_id[c] > 2) { if(verbose) { print "INTERVALS for class " c; for(i = 1; i < next_id[c]-1; i++) print intervals["X$" i]; } quicksort("X", intervals, 1, next_id[c]-2); if(verbose) { print "POST SORT INTERVALS for class " c; for(i = 1; i < next_id[c]-1; i++) print intervals["X$" i]; } # Chop off the low & the high ends lo = int((low_percentile * (next_id[c]-1)) / 100); hi = int((high_percentile * (next_id[c]-1)) / 100); minimums[c] = intervals["X$" lo]; maximums[c] = intervals["X$" hi]; } else { minimums[c] = mn; maximums[c] = mx; } } } # # Quicksort, from The AWK Book # function quicksort(class, A, left, right, i, last) { if(left >= right) return; swap(class, A, left, left + int((right - left + 1)*rand())); last = left; for(i = left+1; i <= right; i++) if(A[class "$" i] < A[class "$" left]) swap(class, A, ++last, i); swap(class, A, left, last); quicksort(class, A, left, last-1); quicksort(class, A, last+1, right); } function swap(class, A, i, j, t) { t = A[class "$" i]; A[class "$" i] = A[class "$" j]; A[class "$" j] = t; }