#!/usr/bin/python
# vim: set fileencoding=UTF-8

#entropyStats.py
#
#Script for computing stats relating to guessing difficulty of a distribution
#
#Input format:
#
#Label-1, count-1
#Label-1, count-2
#...
#
#
#Labels can be anything (with no commas)
#Counts can be int or float
#
#
#January 20, 2010
#Joseph Bonneau
#jcb82@cl.cam.ac.uk


import getpass
import sys
import os
import sys
import time
import math
import numpy as np
from scipy import linspace, polyval, polyfit, sqrt, stats, randn, array
from pylab import plot, loglog, title, show , legend, xlim, ylim
import matplotlib.pyplot as plot
from matplotlib import rc
from matplotlib.pyplot import savefig
from matplotlib.transforms import offset_copy

#For output of graphs with TeX fonts to match paper
rc('font',**{'family':'serif','serif':['Computer Modern']})
rc('text', usetex=True)

#Value of beta to print in the table
lambdaPrint = 3

#Series of labels to use when printing tables or graphs, replacing less elegant filenames
titles = {'SKSurnames.csv':'South Korea',
'USSurnames.csv':'USA',
'ChileSurnames.csv':'Chile',
'EstoniaSurnames.csv':'Estonia',
'JapanSurnames.csv':'Japan',
'EWSurnames.csv':'England',
'SpainSurnames.csv' : 'Spain',
'Spain100ForenamesMaleC.csv' : 'Spain',
'Maria carmen' : 'Maria',
'Antonio' : 'Jose',
'Spain100ForenamesFemaleC.csv' : 'Spain',
'PINS.csv' : 'PIN',
'Spain100ForenamesAllC.csv' : 'Spain',
'Spain100ForenamesMaleS.csv' : 'Spain',
'Spain100ForenamesFemaleS.csv' : 'Spain',
'Spain100ForenamesAllS.csv' : 'Spain',
'SpainSurnames.csv' : 'Spain',
'FinlandSurnames.csv':'Finland',
'mascots.csv':'School Mascots (US)',
'EAGLES':'Eagles',
'NorwaySurnames.csv':'Norway',
'AustSurnames.csv':'Australia',
'Gonz\xc3\x81lez':'Gonz\\\'{a}lez',
'Maria gonzalez':'{\\scriptsize Maria Gonzalez}',
'佐藤':'Sat\\={o}',
'김':'Kim',
'Jón':'J\\\'{o}n',
'Guðrún':'Gu\dh{}r\\\'{u}n',
'Holyrood secondary school':'Holyrood',
'Essex primary school':'Essex',
'LADogs.csv':'Los Angeles',
'SFDogs.csv':'San Francisco',
'DMDogs.csv':'Des Moines',
#'FBSurnames.csv':'Facebook',
#'FBForenames.csv':'Facebook',
'FBSurnames.csv':'Surname',
'FBForenames.csv':'Forename',
'FBSurnamesSpanish.csv':'Facebook (Sp.)',
'FBForenamesSpanish.csv':'Facebook (Sp.)',
'FBAllNames.csv':'Full name',
'BelgiumForenamesFemale2008.csv':'Belgium',
'BelgiumForenamesMale2008.csv':'Belgium',
'BelgiumForenamesAll2008.csv':'Belgium',
'EWPrimary.csv':'UK Primary Schools',
'FaroeIslandsForenamesFemale2008.csv':'Faroe Islands',
'FaroeIslandsForenamesMale2008.csv':'Faroe Islands',
'FaroeIslandsForenamesAll2008.csv':'Faroe Islands',
'Iceland100ForenamesFemale2007.csv':'Iceland',
'Iceland100ForenamesMale2007.csv':'Iceland',
'Iceland100ForenamesAll2007.csv':'Iceland',
'ScotHigh.csv':'UK High Schools',
'TouristCities150.csv': 'Tourist Destinations',
'UK100Cities.csv': 'UK Cities',
'US1000ForenamesFemale1950s.csv':'USA, 1950 (\\female)',
'US1000ForenamesFemale1960s.csv':'USA, 1960 (\\female)',
'US1000ForenamesFemale1970s.csv':'USA, 1970 (\\female)',
'US1000ForenamesFemale1980s.csv':'USA, 1980 (\\female)',
'US1000ForenamesFemale1990s.csv':'USA, 1990 (\\female)',
'US1000ForenamesFemale2000s.csv':'USA, 2000 (\\female)',
'US1000ForenamesMale1950s.csv':'USA, 1950 (\\male)',
'US1000ForenamesMale1960s.csv':'USA, 1960 (\\male)',
'US1000ForenamesMale1970s.csv':'USA, 1970 (\\male)',
'US1000ForenamesMale1980s.csv':'USA, 1980 (\\male)',
'US1000ForenamesMale1990s.csv':'USA, 1990 (\\male)',
'US1000ForenamesMale2000s.csv':'USA, 2000 (\\male)',
'USForenamesMaleAll.csv':'USA',
'USForenamesFemaleAll.csv':'USA',
'USForenamesAll.csv':'USA',
'SKSurnames.csv':'South Korea',
'SKSurnames.csv':'South Korea',
'SKSurnames.csv':'South Korea',
'SKSurnames.csv':'South Korea'

}

#Total populations for data files, as some data files don't represent the total population
#If nothing entered, assume the total population is represented
populations = {'USSurnames.csv':281421906,
'EWSurnames.csv':55918842,
'FinlandSurnames.csv':5342936,
'NorwaySurnames.csv':4812200,
'SKSurnames.csv':45964816,
'SpainSurnames.csv':46157822,
'Spain100ForenamesMaleC.csv' : 22847737,
'Spain100ForenamesFemaleC.csv' : 23310085,
'Spain100ForenamesAllC.csv' : 46157822,
'Spain100ForenamesMaleS.csv' : 22847737,
'Spain100ForenamesFemaleS.csv' : 23310085,
'Spain100ForenamesAllS.csv' : 46157822,
'ChileSurnames.csv':16763470,
'EstoniaSurnames.csv':1340000,
'EstoniaSurnames.csv':1340000,
#based on Wikipedia scaling
'JapanSurnames.csv':30000000,
#based on existence of singletons, removing Aust limit
#'AustSurnames.csv':20700000,
'mascots.csv': 24292,
'USSurnames.csv':281421906,
'USSurnames.csv':281421906,
#estimate from LA data
'DMDogs.csv':30820,
'USSurnames.csv':281421906,
'USSurnames.csv':281421906,
'USSurnames.csv':281421906,
'USSurnames.csv':281421906,
'USSurnames.csv':281421906,
'USSurnames.csv':281421906,
'USSurnames.csv':281421906,
'USSurnames.csv':281421906,
'belgiumForenamesFemale2008.csv':5139750,
'belgiumForenamesMale2008.csv':5360250,
'belgiumForenamesAll2008.csv':(5360250 + 5139750),
'Iceland100ForenamesFemale2007.csv':150412,
'Iceland100ForenamesMale2007.csv':150592,
'Iceland100ForenamesAll2007.csv':(150592 + 150412),
'TouristCities150.csv':939792,
'UK100Cities.csv':60800000,
'US1000ForenamesFemale1950s.csv':19726424,
'US1000ForenamesFemale1960s.csv':18895598,
'US1000ForenamesFemale1970s.csv':16454275,
'US1000ForenamesFemale1980s.csv':18440310,
'US1000ForenamesFemale1990s.csv':19629836,
'US1000ForenamesFemale2000s.csv':18224249,
'US1000ForenamesMale1950s.csv':20497306,
'US1000ForenamesMale1960s.csv':19609110,
'US1000ForenamesMale1970s.csv':17093407,
'US1000ForenamesMale1980s.csv':19213407,
'US1000ForenamesMale1990s.csv':20534419,
'US1000ForenamesMale2000s.csv':19078548,
'USForenamesMaleAll.csv':(20497306+19609110+17093407+19213407+20534419+19078548),
'USForenamesFemaleAll.csv':(19726424+18895598+16454275+18440310+19629836+18224249),
'USForenamesAll.csv':(20497306+19609110+17093407+19213407+20534419+19078548+19726424+18895598+16454275+18440310+19629836+18224249),
'forenames_fixed.txt':142009008,
'USForenamesAll1990.csv':279310741,
}

#Plot several distributions, possibly against some standard data points
def plotDistribution(x, ys,xTitle = None, yTitle = None, plotTitle = None, legendPlacement = "lower right", numMarks = 10, axisV = None, saveLoc = "plot_output.pdf", showPlot = True, showPresetData = True, colors = ('k'), marks = ('',''), lineStyles = ('-'), markerSize = 5):

	fig = plot.figure()
	ax = fig.add_subplot(111)

	if plotTitle is not None: title(plotTitle)

	legend_text = []

	#Plot passed in points. Each should be of the form (dataArray, title)
	#Iterate through passed in colors, marks, and lineStyles
	count = 0
	for y in ys:
	
		print colors[count % len(colors)]
		print lineStyles[count % len(lineStyles)]
		print marks[count % len(marks)]
		label = "" + colors[count % len(colors)] + lineStyles[count % len(lineStyles)] + marks[count % len(marks)]
		count += 1

		lines = ax.plot(x,y[0],label)
		plot.setp(lines, 'ms',  markerSize)

		if y[1] in titles: legend_text.append(titles[y[1]])
		else: legend_text.append(y[1])
		
	#Include hard-coded data points comparing to other authentication means
	if showPresetData:
		key_points = []
		key_points.append(('Password [Klein]', 'bo', [0.0007, 0.0397, 0.2421],  [15.63, 15.78, 18.09]))
		key_points.append(('Password [Spafford]', 'co', [0.045, 0.1998],  [19.09, 23.58]))
		key_points.append(('Password [Schneier]', 'yo', [0.0131, 0.23, 0.55],  [10.58, 30.54, 33.29]))
		key_points.append(('Pass-Go', 'm^', [0.19, 0.40], [38.3, 44.12]))
		key_points.append(('PassPoints', 'g>', [0.1228, 0.1667, 0.1930], [19.23, 24.58, 25.77]))
		key_points.append(('Passfaces', 'rv', [0.1, 0.25], [4.32, 5.70]))
		key_points.append(('Handwriting', 'k*', [0.155, 0.5], [2.689, 23]))

		for k in key_points:
			ax.plot(k[2], k[3], k[1], markersize = 6)
			legend_text.append(k[0])

	if axisV is not None:
		ax.axis(axisV)

	legend(legend_text, loc = legendPlacement)

	if xTitle is not None: ax.set_xlabel(xTitle)
	if yTitle is not None: ax.set_ylabel(yTitle)

	if saveLoc is not None: savefig(saveLoc, format='pdf')

	if showPlot: show()


#Shape a distribution, finding the maximum
#d is a distribution, stats it's stats, rMax the limiting r value
def shapeDist(d, stats, rMax, debug = False):

	results = []
	
	if debug: print d
	if debug: print "shaping for r", rMax

	index = 0			
	count = 0
	pCum = 0.0
	pMax = 1.0

	#iteratively add events from the head of the distribution which we can safely shape down, until 
	#we can't shape p_1 through p_m to be equal to p_m+1 without exceeding rMax
	while index < len(d) - 1:
		
		#cumulative probability which has been shaped
		pCum += d[index][0] * d[index][1]
		count += d[index][1]

		#test whether we can lower all events up to index without exceeding rMax
		if(pCum - count * d[index + 1][0] > rMax):
			#this will be the maximum probability we can shape all events up to index to
			pMax = float(pCum - rMax) / count
			break

		index += 1

	#if we were able to shape all events, don't shape the final one
	if (index == len(d) - 1): 
		pMax = d[len(d) - 1][0]		
		index -= 1

	#array of acceptance rates for each event
	a = [1.0] * len(d)

	#overall acceptance rate - build this up to double check that predicted value is within limit
	rActual = 0.0

	#compute acceptance rates
	for i in range(0, index + 1):
		a[i] = pMax / d[i][0]
		rActual += d[i][1] * (1 - a[i]) * d[i][0]

	#final values in distribution - scale up by 1/rActual
	for i in range(0, len(d)):
		d[i] = (d[i][0] * a[i] / (1 - rActual), d[i][1])


	if debug: print 'pMax', pMax
	if debug: print 'index', index
	if debug: print 'rMax', rMax
	if debug: print 'rActual', rActual
	if debug: print d, "\n"

	stats['rActual'] = rActual
	stats['r0'] = 1.0-a[0]

	return

#Compute a suite of stats on the distribution d
#alphaStep controls the granularity for which mu_alpha is computed
#maxBeta is the maximum beta value for which lambda_beta is computed
def computeStats(d, stats, maxAlpha = 1.0, alphaStep = 0.01, maxBeta = 20, f = None, debug = False):	

	N = stats['totalKnown']		#total of all events in distribution
	popN = stats['total']		#total of all events, including unknowns
	
	#mu values, both numeric and converted to bits (t)
	nmu = []
	tmu= []

	#lambda values, both numeric and converted to bits (t)
	nlambda = []
	tlambda = []

	h1 = 0.0	#Shannon entropy H1
	h2 = 0.0	#Reny entropy H2
	g = 0.0		#Guess entropy G
	gini = 0	#Gini coefficient

	index = 0
	pCum = 0.0
	alphaCum = 0.0

	#iterate through each event, building up the summation stats

	i = 0
	j = 0

	while True:
		
		if pCum == 1: break

		index += 1
		if i < len(d):
			p = float(d[i][0]) / popN	#next probability

			#step i iteratively, repeating for multiple events with same probability		
			j+=1
			if j >= d[i][1]: 
				i+= 1
				j = 0

			if i == len(d): stats['pKnown'] = pCum #record total known probability

		else:
			p = float(d[len(d) - 1][0]) / popN 	#repeat last known probability
			p = min(p, 1 - pCum)	#last probability may need to be smaller to avoid going over 1


		if debug: print p

		#update summation stats
		pCum += p
		gini += pCum - (float(index + 1) / popN)
		h1 += - p * math.log(p, 2)
		g += p * (index + 1)
		h2 += p * p

		#fill in as many values of mu as possible
		#may be multiple if probability is bigger than alphaStep
		while(pCum - alphaCum >= alphaStep and alphaCum < maxAlpha):
			alphaCum += alphaStep
			nmu.append(index)
			if debug: print i, alphaCum
			tmu.append(math.log(float(index) / alphaCum, 2))
			
		#fill in next lambda value
		if index < maxBeta:
			nlambda.append(pCum)
			tlambda.append(math.log(float(index) / pCum, 2))

		if debug: 
			if i  == len(d) - 1:
				print pCum, alphaCum
			print gini / N


	#fixup last item
	if(len(nmu) < (1.0 / alphaStep)):
		alphaCum += alphaStep
		nmu.append(index + 1)
		if debug: print index, alphaCum
		tmu.append(math.log(float(index + 1) / alphaCum, 2))

	#finalize statitsics
	h2 = -1 * math.log(h2, 2)
	gt = math.log(2 * g - 1, 2)
	gini /= popN
	gini *= 2

	#record into lookup table
	stats['N'] = index
	stats['Total'] = popN
	stats['H0'] = math.log(stats['N'], 2)
	stats['H1'] = h1
	stats['H2'] = h2
	stats['G'] = g
	stats['~G'] = gt
	stats['Hmin'] = -math.log(float(d[0][0]) / popN, 2)
	stats['Gini'] = gini

	stats['mu'] = nmu
	stats['~mu'] = tmu
	stats['lambda'] = nlambda
	stats['~lambda'] = tlambda
	
	stats['alphaStep'] = alphaStep

	return stats

#Read a data file and compute its statistics
def readFile(filename, shapeR = None):

	print "Reading File:", filename
	
	f = open(filename, 'r')

	n = 0
	d = []

	stats = {}
	stats['title'] = filename
	#replace title if known
	if stats['title'] in titles: stats['title'] = titles[stats['title']]

	descending = True
	last = None

	total = 0

	while True:
		next = f.readline()
		if next == '':
			break
		
		ids = next.split(',')

		if len(ids) > 1:
			count = ids[1]

			if total == 0:
				name = ids[0]
				name = name.strip()
				name = name.strip('\"')
				name = name.strip()
				stats['x0'] = str(name[0].upper() + name[1:].lower())
				if stats['x0'] in titles:
					stats['x0'] = titles[stats['x0']]

		else:	
			count = ids[0]

		count = count.strip('\n')

		try:
			#interpret as a percent
			if count.find('.') > 0:
				count = int(float(count) * 0.01 * populations[filename])

			count = int(count)

		except ValueError:
			print 'Skipping line', next
			continue
		except KeyError:
			print 'Can\'t use decimals in line', next
			continue
			
		if count <= 0:
			continue

		if last is not None and count > last:
			descending = False

		if count == last:
			d[len(d) - 1][1] += 1
		else:	
			d.append([count, 1])

		last = count
		total += count

		n += 1
		if n %1000 == 0:
			print '.',

	print 'done',

	#if data file wasn't in descending order, then sort it
	if descending: print 'sorting correct'
	else: 
		print 'sorting incorrect, fixing'
		d.sort(cmp=lambda x,y: -cmp(x[0], y[0]))
		newd = []
		last = -1
		for i in d:
			if i[0] == last:
				newd[len(newd) - 1][1] += 1
			else:
				newd.append([i[0], 1])	

			last = i[0]
		else:	
			d.append((count, 1))

		print 'sorting done'
	
		print newd
		d = newd

	stats['totalKnown'] = total
	if filename in populations: stats['total'] = populations[filename]
	else: stats['total'] = total
	stats['trueTotal'] = stats['total']

	computeStats(d, stats)

	results = []
	
	results.append((d, stats, filename))

	#compute various shapings, if passed in
	if shapeR is not None:
		for r in shapeR:
			dg = []
			for x in d:
				dg.append((float(x[0]) / stats['total'], x[1]))
			sg = {}
			shapeDist(dg, sg, r)
			sg['title'] = str(stats['title']) + " (r=" + str(r) + ")"
			sg['x0'] = stats['x0']
			sg['total'] = 1.0
			sg['trueTotal'] = stats['trueTotal']
			sg['totalKnown'] = float(stats['total'])/ float(stats['totalKnown'])
			computeStats(dg, sg)
			results.append((dg, sg, filename))

	return results

#Print statistics about the distribution
def printStats(xs, plotMu = False, sortDists = True, strings = []):


	if len(xs) == 0: return

	print "\n-----------------------------\nStatistics Dump\n-----------------------------\n"


	for x in xs:
		print "Stats for distibution in file: ", x[2]

		for k in x[1]:
			print k, '\t',  x[1][k]
	
		
	print "\n-----------------------------\nMarginal Guess Prob\n-----------------------------\n"

	mu_len = len(xs[0][1]['~mu'])
	print "mu",
	for x in xs:
		print str("\t" + x[1]['title']),

	print "\n",

	for i in xrange(mu_len):
		print (i + 1) * xs[0][1]['alphaStep'],
		for x in xs:
			print str("\t" + "%.5f"%x[1]['~mu'][i]),
		print "\n",


	x_coord = []
	j = 0
	for i in xrange(0, mu_len):
		j += xs[0][1]['alphaStep']	
		x_coord.append(j)
	
	ys = []
	for x in xs:
		ys.append((x[1]['~mu'], x[2]))

	ys.sort(cmp=lambda x,y: cmp(x[0][mu_len - 1], y[0][mu_len - 1]))
	
	if plotMu:
		plotDistribution(x_coord, ys, yTitle = "marginal guesswork\\ $\\tilde{\\mu}_{\\alpha}$", xTitle = "success rate\\ $\\alpha$", legendPlacement = "upper right", axisV = (0, 1, 0, 50), colors = ('k', 'g', 'g', 'r', 'c', 'm', 'y'), marks = ('', ''), lineStyles = ('-', '-.', '--')) 


	print "\n-----------------------------\nMarginal Success Rate \n-----------------------------\n"
	
	print "lambda",
	for x in xs:
		print str("\t" + x[1]['title']),

	print "\n",


	for i in xrange(len(xs[0][1]['~lambda'])):
		print (i + 1),
		for x in xs:
			if (len(x[1]['~lambda']) > i):
				print str("\t" + "%.5f"%x[1]['~lambda'][i]),
		print "\n",


	lambda_len = len(xs[0][1]['~lambda'])

	x_coord = []
	j = 0
	for i in xrange(0, lambda_len):
		x_coord.append(i + 1)
	
	ys = []
	for x in xs:
		ys.append((x[1]['~lambda'], x[2]))

	if sortDists: ys.sort(cmp=lambda x,y: -cmp(x[0][lambda_len - 1], y[0][lambda_len - 1]))
	
	if plotMu: plotDistribution(x_coord, ys, yTitle = "marginal success rate (lambda)", xTitle = "guesses per question (beta)") 

	precision = "%.1f"

	endSpacing = "\lts"

	stringIndex = 0

	#print LaTeX table for inclusion in paper
	for i in xrange(len(xs)):

		x = xs[i]
		while(stringIndex < len(strings) and strings[stringIndex][0] == i):
			print strings[stringIndex][1]
			stringIndex += 1

		print str(x[1]['title'] + " & " + precision%x[1]['H0'] + " & " + precision%x[1]['H1'] + " & " + precision%x[1]['~G'] + " & " + precision%x[1]['H2']   + " & " + precision%x[1]['~mu'][int(0.5/x[1]['alphaStep']) - 1] + " & " + precision%x[1]['~lambda'][lambdaPrint - 1] + " & " + precision%x[1]['Hmin'] + " & " + endSpacing + " " + x[1]['x0']),
		print "\\\\\n",


#-------------------------
#Main function
#-------------------------


if len(sys.argv) < 1:
	print "usage entropyStats.py [files...]"


plotMu = True
numR = -1
plotShapes = (numR < 0)

rs = []
for i in xrange(numR + 1):
	rs.append(i / float(numR))

shapeStats = []
shapeStats.append(('$\\tilde{\\mu}_{\\frac{1}{2}}$', lambda x: x[1]['~mu'][int(0.5/x[1]['alphaStep']) - 1]))
shapeStats.append(('$\\tilde{\\lambda}_3$', lambda x: x[1]['~lambda'][2]))

gs = []
extraTableStrings = []

xs = []

for f in range(1, len(sys.argv)):

	#Any string starting with a '*' will be an extra string for printing the table
	if sys.argv[f][0] == '*':
		next = sys.argv[f][1:]
		extraTableStrings.append((len(xs), next)) 
		continue

	dists = readFile(sys.argv[f], shapeR = rs)
	xs.extend(dists)

	#Add shaped distributions
	for j in xrange(len(shapeStats)):	
		g = []

		label = str(sys.argv[f])
		if label in titles: label = titles[label]

		for i in xrange(numR + 1):
			g.append(shapeStats[j][1](dists[1 + i]))

		gs.append((g, "\\vphantom{$\\tilde{\\mu}_{\\frac{1}{2}}$ ()}" + shapeStats[j][0] + '\\ (' + label + ')' ))

printStats(xs, plotMu, strings = extraTableStrings)

if plotShapes:
	plotDistribution(rs, gs, xTitle = "Rejection rate\\ $r_*$", yTitle = "Effective security (bits)", colors = ('b', 'b', 'r', 'r', 'm', 'y'), marks = ('', '', '', '', 's', 's', 'o', 'o', '>', 's', 'p', '*', 'h', '+', 'x', 'D'), lineStyles = ('-', '--'), saveLoc = "shape_output.pdf", showPresetData = False)