matching = require '../lib/matching'
scoring = require '../lib/scoring'

fs = require 'fs'
byline = require 'byline'
sprintf = require('sprintf-js').sprintf

check_usage = () ->
  usage = '''

  Run a frequency count on the raw 10M xato password set and keep counts over CUTOFF
  in descending frequency. That file can be found by googling around for:

  "xato 10-million-combos.txt"

  Passwords that both:

  -- fully match according to zxcvbn's date, year, repeat, sequence or keyboard matching algs
  -- have a higher rank than the corresponding match guess number

  are excluded from the final password set, since zxcvbn would score them lower through
  other means anyhow. In practice this rules out dates and years most often and makes room
  for more useful data.

  To use, first run from the zxcvbn base dir:

  npm run build

  then change into the data-scripts directory and run:

  coffee count_xato.coffee --nodejs xato_file.txt ../data/passwords.txt

  '''
  valid = process.argv.length == 5
  valid = valid and process.argv[0] == 'coffee' and process.argv[2] in ['--nodejs', '-n']
  valid = valid and __dirname.split('/').slice(-1)[0] == 'data-scripts'
  unless valid
    console.log usage
    process.exit(0)

# after all passwords are counted, discard pws with counts <= CUTOFF
CUTOFF = 10

# to save memory, after every batch of size BATCH_SIZE, go through counts and delete
# the long tail of entries with only one count.
BATCH_SIZE = 1000000

counts = {}        # maps pw -> count
skipped_lines = 0  # skipped lines in xato file -- lines w/o two tokens
line_count = 0     # current number of lines processed

normalize = (token) ->
  token.toLowerCase()

should_include = (password, xato_rank) ->
  for i in [0...password.length]
    if password.charCodeAt(i) > 127
      # xato mostly contains ascii-only passwords, so in practice
      # this will only skip one or two top passwords over the cutoff.
      # were that not the case / were this used on a different data source, consider using
      # a unidecode-like library instead, similar to count_wikipedia / count_wiktionary
      console.log "SKIPPING non-ascii password=#{password}, rank=#{xato_rank}"
      return false
  matches = []
  for matcher in [
    matching.spatial_match
    matching.repeat_match
    matching.sequence_match
    matching.regex_match
    matching.date_match
    ]
    matches.push.apply matches, matcher.call(matching, password)
  matches = matches.filter (match) ->
    # only keep matches that span the full password
    match.i == 0 and match.j == password.length - 1
  for match in matches
    if scoring.estimate_guesses(match, password) < xato_rank
      # filter out this entry: non-dictionary matching will assign
      # a lower guess estimate.
      return false
  return true

prune = (counts) ->
  for pw, count of counts
    if count == 1
      delete counts[pw]

main = (xato_filename, output_filename) ->
  stream = byline.createStream fs.createReadStream(xato_filename, encoding: 'utf8')
  stream.on 'readable', ->
    while null != (line = stream.read())
      line_count += 1
      if line_count % BATCH_SIZE == 0
        console.log 'counting tokens:', line_count
        prune counts
      tokens = line.trim().split /\s+/
      unless tokens.length == 2
        skipped_lines += 1
        continue
      [username, password] = tokens[..1]
      password = normalize password
      if password of counts
        counts[password] += 1
      else
        counts[password] = 1
  stream.on 'end', ->
    console.log 'skipped lines:', skipped_lines
    pairs = []
    console.log 'copying to tuples'
    for pw, count of counts
      if count > CUTOFF
        pairs.push [pw, count]
      delete counts[pw] # save memory to avoid v8 1GB limit
    console.log 'sorting'
    pairs.sort (p1, p2) ->
      # sort by count. higher counts go first.
      p2[1] - p1[1]
    console.log 'filtering'
    pairs = pairs.filter (pair, i) ->
      rank = i + 1
      [pw, count] = pair
      should_include pw, rank
    output_stream = fs.createWriteStream output_filename, encoding: 'utf8'
    for pair in pairs
      [pw, count] = pair
      output_stream.write sprintf("%-15s %d\n", pw, count)
    output_stream.end()

check_usage()
main process.argv[3], process.argv[4]