#!/usr/bin/python # vim: set fileencoding=UTF-8 #entropyStats.py # #Script for computing stats relating to guessing difficulty of a distribution # #Input format: # #Label-1, count-1 #Label-1, count-2 #... # # #Labels can be anything (with no commas) #Counts can be int or float # # #January 20, 2010 #Joseph Bonneau #jcb82@cl.cam.ac.uk import getpass import sys import os import sys import time import math import numpy as np from scipy import linspace, polyval, polyfit, sqrt, stats, randn, array from pylab import plot, loglog, title, show , legend, xlim, ylim import matplotlib.pyplot as plot from matplotlib import rc from matplotlib.pyplot import savefig from matplotlib.transforms import offset_copy #For output of graphs with TeX fonts to match paper rc('font',**{'family':'serif','serif':['Computer Modern']}) rc('text', usetex=True) #Value of beta to print in the table lambdaPrint = 3 #Series of labels to use when printing tables or graphs, replacing less elegant filenames titles = {'SKSurnames.csv':'South Korea', 'USSurnames.csv':'USA', 'ChileSurnames.csv':'Chile', 'EstoniaSurnames.csv':'Estonia', 'JapanSurnames.csv':'Japan', 'EWSurnames.csv':'England', 'SpainSurnames.csv' : 'Spain', 'Spain100ForenamesMaleC.csv' : 'Spain', 'Maria carmen' : 'Maria', 'Antonio' : 'Jose', 'Spain100ForenamesFemaleC.csv' : 'Spain', 'PINS.csv' : 'PIN', 'Spain100ForenamesAllC.csv' : 'Spain', 'Spain100ForenamesMaleS.csv' : 'Spain', 'Spain100ForenamesFemaleS.csv' : 'Spain', 'Spain100ForenamesAllS.csv' : 'Spain', 'SpainSurnames.csv' : 'Spain', 'FinlandSurnames.csv':'Finland', 'mascots.csv':'School Mascots (US)', 'EAGLES':'Eagles', 'NorwaySurnames.csv':'Norway', 'AustSurnames.csv':'Australia', 'Gonz\xc3\x81lez':'Gonz\\\'{a}lez', 'Maria gonzalez':'{\\scriptsize Maria Gonzalez}', '佐藤':'Sat\\={o}', '김':'Kim', 'Jón':'J\\\'{o}n', 'Guðrún':'Gu\dh{}r\\\'{u}n', 'Holyrood secondary school':'Holyrood', 'Essex primary school':'Essex', 'LADogs.csv':'Los Angeles', 'SFDogs.csv':'San Francisco', 'DMDogs.csv':'Des Moines', #'FBSurnames.csv':'Facebook', #'FBForenames.csv':'Facebook', 'FBSurnames.csv':'Surname', 'FBForenames.csv':'Forename', 'FBSurnamesSpanish.csv':'Facebook (Sp.)', 'FBForenamesSpanish.csv':'Facebook (Sp.)', 'FBAllNames.csv':'Full name', 'BelgiumForenamesFemale2008.csv':'Belgium', 'BelgiumForenamesMale2008.csv':'Belgium', 'BelgiumForenamesAll2008.csv':'Belgium', 'EWPrimary.csv':'UK Primary Schools', 'FaroeIslandsForenamesFemale2008.csv':'Faroe Islands', 'FaroeIslandsForenamesMale2008.csv':'Faroe Islands', 'FaroeIslandsForenamesAll2008.csv':'Faroe Islands', 'Iceland100ForenamesFemale2007.csv':'Iceland', 'Iceland100ForenamesMale2007.csv':'Iceland', 'Iceland100ForenamesAll2007.csv':'Iceland', 'ScotHigh.csv':'UK High Schools', 'TouristCities150.csv': 'Tourist Destinations', 'UK100Cities.csv': 'UK Cities', 'US1000ForenamesFemale1950s.csv':'USA, 1950 (\\female)', 'US1000ForenamesFemale1960s.csv':'USA, 1960 (\\female)', 'US1000ForenamesFemale1970s.csv':'USA, 1970 (\\female)', 'US1000ForenamesFemale1980s.csv':'USA, 1980 (\\female)', 'US1000ForenamesFemale1990s.csv':'USA, 1990 (\\female)', 'US1000ForenamesFemale2000s.csv':'USA, 2000 (\\female)', 'US1000ForenamesMale1950s.csv':'USA, 1950 (\\male)', 'US1000ForenamesMale1960s.csv':'USA, 1960 (\\male)', 'US1000ForenamesMale1970s.csv':'USA, 1970 (\\male)', 'US1000ForenamesMale1980s.csv':'USA, 1980 (\\male)', 'US1000ForenamesMale1990s.csv':'USA, 1990 (\\male)', 'US1000ForenamesMale2000s.csv':'USA, 2000 (\\male)', 'USForenamesMaleAll.csv':'USA', 'USForenamesFemaleAll.csv':'USA', 'USForenamesAll.csv':'USA', 'SKSurnames.csv':'South Korea', 'SKSurnames.csv':'South Korea', 'SKSurnames.csv':'South Korea', 'SKSurnames.csv':'South Korea' } #Total populations for data files, as some data files don't represent the total population #If nothing entered, assume the total population is represented populations = {'USSurnames.csv':281421906, 'EWSurnames.csv':55918842, 'FinlandSurnames.csv':5342936, 'NorwaySurnames.csv':4812200, 'SKSurnames.csv':45964816, 'SpainSurnames.csv':46157822, 'Spain100ForenamesMaleC.csv' : 22847737, 'Spain100ForenamesFemaleC.csv' : 23310085, 'Spain100ForenamesAllC.csv' : 46157822, 'Spain100ForenamesMaleS.csv' : 22847737, 'Spain100ForenamesFemaleS.csv' : 23310085, 'Spain100ForenamesAllS.csv' : 46157822, 'ChileSurnames.csv':16763470, 'EstoniaSurnames.csv':1340000, 'EstoniaSurnames.csv':1340000, #based on Wikipedia scaling 'JapanSurnames.csv':30000000, #based on existence of singletons, removing Aust limit #'AustSurnames.csv':20700000, 'mascots.csv': 24292, 'USSurnames.csv':281421906, 'USSurnames.csv':281421906, #estimate from LA data 'DMDogs.csv':30820, 'USSurnames.csv':281421906, 'USSurnames.csv':281421906, 'USSurnames.csv':281421906, 'USSurnames.csv':281421906, 'USSurnames.csv':281421906, 'USSurnames.csv':281421906, 'USSurnames.csv':281421906, 'USSurnames.csv':281421906, 'belgiumForenamesFemale2008.csv':5139750, 'belgiumForenamesMale2008.csv':5360250, 'belgiumForenamesAll2008.csv':(5360250 + 5139750), 'Iceland100ForenamesFemale2007.csv':150412, 'Iceland100ForenamesMale2007.csv':150592, 'Iceland100ForenamesAll2007.csv':(150592 + 150412), 'TouristCities150.csv':939792, 'UK100Cities.csv':60800000, 'US1000ForenamesFemale1950s.csv':19726424, 'US1000ForenamesFemale1960s.csv':18895598, 'US1000ForenamesFemale1970s.csv':16454275, 'US1000ForenamesFemale1980s.csv':18440310, 'US1000ForenamesFemale1990s.csv':19629836, 'US1000ForenamesFemale2000s.csv':18224249, 'US1000ForenamesMale1950s.csv':20497306, 'US1000ForenamesMale1960s.csv':19609110, 'US1000ForenamesMale1970s.csv':17093407, 'US1000ForenamesMale1980s.csv':19213407, 'US1000ForenamesMale1990s.csv':20534419, 'US1000ForenamesMale2000s.csv':19078548, 'USForenamesMaleAll.csv':(20497306+19609110+17093407+19213407+20534419+19078548), 'USForenamesFemaleAll.csv':(19726424+18895598+16454275+18440310+19629836+18224249), 'USForenamesAll.csv':(20497306+19609110+17093407+19213407+20534419+19078548+19726424+18895598+16454275+18440310+19629836+18224249), 'forenames_fixed.txt':142009008, 'USForenamesAll1990.csv':279310741, } #Plot several distributions, possibly against some standard data points def plotDistribution(x, ys,xTitle = None, yTitle = None, plotTitle = None, legendPlacement = "lower right", numMarks = 10, axisV = None, saveLoc = "plot_output.pdf", showPlot = True, showPresetData = True, colors = ('k'), marks = ('',''), lineStyles = ('-'), markerSize = 5): fig = plot.figure() ax = fig.add_subplot(111) if plotTitle is not None: title(plotTitle) legend_text = [] #Plot passed in points. Each should be of the form (dataArray, title) #Iterate through passed in colors, marks, and lineStyles count = 0 for y in ys: print colors[count % len(colors)] print lineStyles[count % len(lineStyles)] print marks[count % len(marks)] label = "" + colors[count % len(colors)] + lineStyles[count % len(lineStyles)] + marks[count % len(marks)] count += 1 lines = ax.plot(x,y[0],label) plot.setp(lines, 'ms', markerSize) if y[1] in titles: legend_text.append(titles[y[1]]) else: legend_text.append(y[1]) #Include hard-coded data points comparing to other authentication means if showPresetData: key_points = [] key_points.append(('Password [Klein]', 'bo', [0.0007, 0.0397, 0.2421], [15.63, 15.78, 18.09])) key_points.append(('Password [Spafford]', 'co', [0.045, 0.1998], [19.09, 23.58])) key_points.append(('Password [Schneier]', 'yo', [0.0131, 0.23, 0.55], [10.58, 30.54, 33.29])) key_points.append(('Pass-Go', 'm^', [0.19, 0.40], [38.3, 44.12])) key_points.append(('PassPoints', 'g>', [0.1228, 0.1667, 0.1930], [19.23, 24.58, 25.77])) key_points.append(('Passfaces', 'rv', [0.1, 0.25], [4.32, 5.70])) key_points.append(('Handwriting', 'k*', [0.155, 0.5], [2.689, 23])) for k in key_points: ax.plot(k[2], k[3], k[1], markersize = 6) legend_text.append(k[0]) if axisV is not None: ax.axis(axisV) legend(legend_text, loc = legendPlacement) if xTitle is not None: ax.set_xlabel(xTitle) if yTitle is not None: ax.set_ylabel(yTitle) if saveLoc is not None: savefig(saveLoc, format='pdf') if showPlot: show() #Shape a distribution, finding the maximum #d is a distribution, stats it's stats, rMax the limiting r value def shapeDist(d, stats, rMax, debug = False): results = [] if debug: print d if debug: print "shaping for r", rMax index = 0 count = 0 pCum = 0.0 pMax = 1.0 #iteratively add events from the head of the distribution which we can safely shape down, until #we can't shape p_1 through p_m to be equal to p_m+1 without exceeding rMax while index < len(d) - 1: #cumulative probability which has been shaped pCum += d[index][0] * d[index][1] count += d[index][1] #test whether we can lower all events up to index without exceeding rMax if(pCum - count * d[index + 1][0] > rMax): #this will be the maximum probability we can shape all events up to index to pMax = float(pCum - rMax) / count break index += 1 #if we were able to shape all events, don't shape the final one if (index == len(d) - 1): pMax = d[len(d) - 1][0] index -= 1 #array of acceptance rates for each event a = [1.0] * len(d) #overall acceptance rate - build this up to double check that predicted value is within limit rActual = 0.0 #compute acceptance rates for i in range(0, index + 1): a[i] = pMax / d[i][0] rActual += d[i][1] * (1 - a[i]) * d[i][0] #final values in distribution - scale up by 1/rActual for i in range(0, len(d)): d[i] = (d[i][0] * a[i] / (1 - rActual), d[i][1]) if debug: print 'pMax', pMax if debug: print 'index', index if debug: print 'rMax', rMax if debug: print 'rActual', rActual if debug: print d, "\n" stats['rActual'] = rActual stats['r0'] = 1.0-a[0] return #Compute a suite of stats on the distribution d #alphaStep controls the granularity for which mu_alpha is computed #maxBeta is the maximum beta value for which lambda_beta is computed def computeStats(d, stats, maxAlpha = 1.0, alphaStep = 0.01, maxBeta = 20, f = None, debug = False): N = stats['totalKnown'] #total of all events in distribution popN = stats['total'] #total of all events, including unknowns #mu values, both numeric and converted to bits (t) nmu = [] tmu= [] #lambda values, both numeric and converted to bits (t) nlambda = [] tlambda = [] h1 = 0.0 #Shannon entropy H1 h2 = 0.0 #Reny entropy H2 g = 0.0 #Guess entropy G gini = 0 #Gini coefficient index = 0 pCum = 0.0 alphaCum = 0.0 #iterate through each event, building up the summation stats i = 0 j = 0 while True: if pCum == 1: break index += 1 if i < len(d): p = float(d[i][0]) / popN #next probability #step i iteratively, repeating for multiple events with same probability j+=1 if j >= d[i][1]: i+= 1 j = 0 if i == len(d): stats['pKnown'] = pCum #record total known probability else: p = float(d[len(d) - 1][0]) / popN #repeat last known probability p = min(p, 1 - pCum) #last probability may need to be smaller to avoid going over 1 if debug: print p #update summation stats pCum += p gini += pCum - (float(index + 1) / popN) h1 += - p * math.log(p, 2) g += p * (index + 1) h2 += p * p #fill in as many values of mu as possible #may be multiple if probability is bigger than alphaStep while(pCum - alphaCum >= alphaStep and alphaCum < maxAlpha): alphaCum += alphaStep nmu.append(index) if debug: print i, alphaCum tmu.append(math.log(float(index) / alphaCum, 2)) #fill in next lambda value if index < maxBeta: nlambda.append(pCum) tlambda.append(math.log(float(index) / pCum, 2)) if debug: if i == len(d) - 1: print pCum, alphaCum print gini / N #fixup last item if(len(nmu) < (1.0 / alphaStep)): alphaCum += alphaStep nmu.append(index + 1) if debug: print index, alphaCum tmu.append(math.log(float(index + 1) / alphaCum, 2)) #finalize statitsics h2 = -1 * math.log(h2, 2) gt = math.log(2 * g - 1, 2) gini /= popN gini *= 2 #record into lookup table stats['N'] = index stats['Total'] = popN stats['H0'] = math.log(stats['N'], 2) stats['H1'] = h1 stats['H2'] = h2 stats['G'] = g stats['~G'] = gt stats['Hmin'] = -math.log(float(d[0][0]) / popN, 2) stats['Gini'] = gini stats['mu'] = nmu stats['~mu'] = tmu stats['lambda'] = nlambda stats['~lambda'] = tlambda stats['alphaStep'] = alphaStep return stats #Read a data file and compute its statistics def readFile(filename, shapeR = None): print "Reading File:", filename f = open(filename, 'r') n = 0 d = [] stats = {} stats['title'] = filename #replace title if known if stats['title'] in titles: stats['title'] = titles[stats['title']] descending = True last = None total = 0 while True: next = f.readline() if next == '': break ids = next.split(',') if len(ids) > 1: count = ids[1] if total == 0: name = ids[0] name = name.strip() name = name.strip('\"') name = name.strip() stats['x0'] = str(name[0].upper() + name[1:].lower()) if stats['x0'] in titles: stats['x0'] = titles[stats['x0']] else: count = ids[0] count = count.strip('\n') try: #interpret as a percent if count.find('.') > 0: count = int(float(count) * 0.01 * populations[filename]) count = int(count) except ValueError: print 'Skipping line', next continue except KeyError: print 'Can\'t use decimals in line', next continue if count <= 0: continue if last is not None and count > last: descending = False if count == last: d[len(d) - 1][1] += 1 else: d.append([count, 1]) last = count total += count n += 1 if n %1000 == 0: print '.', print 'done', #if data file wasn't in descending order, then sort it if descending: print 'sorting correct' else: print 'sorting incorrect, fixing' d.sort(cmp=lambda x,y: -cmp(x[0], y[0])) newd = [] last = -1 for i in d: if i[0] == last: newd[len(newd) - 1][1] += 1 else: newd.append([i[0], 1]) last = i[0] else: d.append((count, 1)) print 'sorting done' print newd d = newd stats['totalKnown'] = total if filename in populations: stats['total'] = populations[filename] else: stats['total'] = total stats['trueTotal'] = stats['total'] computeStats(d, stats) results = [] results.append((d, stats, filename)) #compute various shapings, if passed in if shapeR is not None: for r in shapeR: dg = [] for x in d: dg.append((float(x[0]) / stats['total'], x[1])) sg = {} shapeDist(dg, sg, r) sg['title'] = str(stats['title']) + " (r=" + str(r) + ")" sg['x0'] = stats['x0'] sg['total'] = 1.0 sg['trueTotal'] = stats['trueTotal'] sg['totalKnown'] = float(stats['total'])/ float(stats['totalKnown']) computeStats(dg, sg) results.append((dg, sg, filename)) return results #Print statistics about the distribution def printStats(xs, plotMu = False, sortDists = True, strings = []): if len(xs) == 0: return print "\n-----------------------------\nStatistics Dump\n-----------------------------\n" for x in xs: print "Stats for distibution in file: ", x[2] for k in x[1]: print k, '\t', x[1][k] print "\n-----------------------------\nMarginal Guess Prob\n-----------------------------\n" mu_len = len(xs[0][1]['~mu']) print "mu", for x in xs: print str("\t" + x[1]['title']), print "\n", for i in xrange(mu_len): print (i + 1) * xs[0][1]['alphaStep'], for x in xs: print str("\t" + "%.5f"%x[1]['~mu'][i]), print "\n", x_coord = [] j = 0 for i in xrange(0, mu_len): j += xs[0][1]['alphaStep'] x_coord.append(j) ys = [] for x in xs: ys.append((x[1]['~mu'], x[2])) ys.sort(cmp=lambda x,y: cmp(x[0][mu_len - 1], y[0][mu_len - 1])) if plotMu: plotDistribution(x_coord, ys, yTitle = "marginal guesswork\\ $\\tilde{\\mu}_{\\alpha}$", xTitle = "success rate\\ $\\alpha$", legendPlacement = "upper right", axisV = (0, 1, 0, 50), colors = ('k', 'g', 'g', 'r', 'c', 'm', 'y'), marks = ('', ''), lineStyles = ('-', '-.', '--')) print "\n-----------------------------\nMarginal Success Rate \n-----------------------------\n" print "lambda", for x in xs: print str("\t" + x[1]['title']), print "\n", for i in xrange(len(xs[0][1]['~lambda'])): print (i + 1), for x in xs: if (len(x[1]['~lambda']) > i): print str("\t" + "%.5f"%x[1]['~lambda'][i]), print "\n", lambda_len = len(xs[0][1]['~lambda']) x_coord = [] j = 0 for i in xrange(0, lambda_len): x_coord.append(i + 1) ys = [] for x in xs: ys.append((x[1]['~lambda'], x[2])) if sortDists: ys.sort(cmp=lambda x,y: -cmp(x[0][lambda_len - 1], y[0][lambda_len - 1])) if plotMu: plotDistribution(x_coord, ys, yTitle = "marginal success rate (lambda)", xTitle = "guesses per question (beta)") precision = "%.1f" endSpacing = "\lts" stringIndex = 0 #print LaTeX table for inclusion in paper for i in xrange(len(xs)): x = xs[i] while(stringIndex < len(strings) and strings[stringIndex][0] == i): print strings[stringIndex][1] stringIndex += 1 print str(x[1]['title'] + " & " + precision%x[1]['H0'] + " & " + precision%x[1]['H1'] + " & " + precision%x[1]['~G'] + " & " + precision%x[1]['H2'] + " & " + precision%x[1]['~mu'][int(0.5/x[1]['alphaStep']) - 1] + " & " + precision%x[1]['~lambda'][lambdaPrint - 1] + " & " + precision%x[1]['Hmin'] + " & " + endSpacing + " " + x[1]['x0']), print "\\\\\n", #------------------------- #Main function #------------------------- if len(sys.argv) < 1: print "usage entropyStats.py [files...]" plotMu = True numR = -1 plotShapes = (numR < 0) rs = [] for i in xrange(numR + 1): rs.append(i / float(numR)) shapeStats = [] shapeStats.append(('$\\tilde{\\mu}_{\\frac{1}{2}}$', lambda x: x[1]['~mu'][int(0.5/x[1]['alphaStep']) - 1])) shapeStats.append(('$\\tilde{\\lambda}_3$', lambda x: x[1]['~lambda'][2])) gs = [] extraTableStrings = [] xs = [] for f in range(1, len(sys.argv)): #Any string starting with a '*' will be an extra string for printing the table if sys.argv[f][0] == '*': next = sys.argv[f][1:] extraTableStrings.append((len(xs), next)) continue dists = readFile(sys.argv[f], shapeR = rs) xs.extend(dists) #Add shaped distributions for j in xrange(len(shapeStats)): g = [] label = str(sys.argv[f]) if label in titles: label = titles[label] for i in xrange(numR + 1): g.append(shapeStats[j][1](dists[1 + i])) gs.append((g, "\\vphantom{$\\tilde{\\mu}_{\\frac{1}{2}}$ ()}" + shapeStats[j][0] + '\\ (' + label + ')' )) printStats(xs, plotMu, strings = extraTableStrings) if plotShapes: plotDistribution(rs, gs, xTitle = "Rejection rate\\ $r_*$", yTitle = "Effective security (bits)", colors = ('b', 'b', 'r', 'r', 'm', 'y'), marks = ('', '', '', '', 's', 's', 'o', 'o', '>', 's', 'p', '*', 'h', '+', 'x', 'D'), lineStyles = ('-', '--'), saveLoc = "shape_output.pdf", showPresetData = False)