PRCYCoin  2.0.0.7rc1
P2P Digital Currency
matching.cpp
Go to the documentation of this file.
1 #include <zxcvbn/matching.hpp>
2 
4 #include <zxcvbn/common.hpp>
5 #include <zxcvbn/optional.hpp>
7 #include <zxcvbn/scoring.hpp>
8 #include <zxcvbn/util.hpp>
9 
10 #include <algorithm>
11 #include <array>
12 #include <functional>
13 #include <initializer_list>
14 #include <regex>
15 #include <sstream>
16 #include <string>
17 #include <vector>
18 #include <unordered_map>
19 #include <utility>
20 #include <unordered_set>
21 
22 namespace zxcvbn {
23 
24 // TODO: make this a constexpr
25 extern const std::vector<std::pair<std::string, std::vector<std::string>>> L33T_TABLE = {
26  {"a", {"4", "@"}},
27  {"b", {"8"}},
28  {"c", {"(", "{", "[", "<"}},
29  {"e", {"3"}},
30  {"g", {"6", "9"}},
31  {"i", {"1", "!", "|"}},
32  {"l", {"1", "|", "7"}},
33  {"o", {"0"}},
34  {"s", {"$", "5"}},
35  {"t", {"+", "7"}},
36  {"x", {"%"}},
37  {"z", {"2"}},
38 };
39 
40 // TODO: make this constexpr
41 extern const std::vector<std::pair<RegexTag, std::regex>> REGEXEN = {
42  {RegexTag::RECENT_YEAR, std::regex(R"(19\d\d|200\d|201\d)")},
43 };
44 
45 const auto DATE_MAX_YEAR = 2050;
46 const auto DATE_MIN_YEAR = 1000;
47 constexpr std::initializer_list<std::pair<int, int>> DATE_SPLITS[] = {
48  { // for length-4 strings, eg 1191 or 9111, two ways to split:
49  {1, 2}, // 1 1 91 (2nd split starts at index 1, 3rd at index 2)
50  {2, 3}, // 91 1 1
51  },
52  {
53  {1, 3}, // 1 11 91
54  {2, 3}, // 11 1 91
55  },
56  {
57  {1, 2}, // 1 1 1991
58  {2, 4}, // 11 11 91
59  {4, 5}, // 1991 1 1
60  },
61  {
62  {1, 3}, // 1 11 1991
63  {2, 3}, // 11 1 1991
64  {4, 5}, // 1991 1 11
65  {4, 6}, // 1991 11 1
66  },
67  {
68  {2, 4}, // 11 11 1991
69  {4, 6}, // 1991 11 11
70  },
71 };
72 
73 static
74 std::string translate(const std::string & string,
75  const std::unordered_map<std::string, std::string> & chr_map) {
76  std::string toret;
77  auto bit = std::back_inserter(toret);
78  toret.reserve(string.size());
79  for (auto it = string.begin(); it != string.end();) {
80  auto nextit = util::utf8_iter(it, string.end());
81  auto ch = std::string(it, nextit);
82  auto mit = chr_map.find(ch);
83  if (mit != chr_map.end()) {
84  ch = mit->second;
85  }
86  std::copy(ch.begin(), ch.end(), bit);
87  it = nextit;
88  }
89  return toret;
90 }
91 
92 static
93 std::vector<Match> & sorted(std::vector<Match> & matches) {
94  std::sort(matches.begin(), matches.end(),
95  [&] (const Match & m1, const Match & m2) -> bool {
96  return std::make_pair(m1.i, m1.j) < std::make_pair(m2.i, m2.j);
97  });
98  return matches;
99 }
100 
101 static
102 std::string dict_normalize(const std::string & str) {
103  // NB: we only have ascii strings in the dictionaries
104  // TODO: when we have more complex strings in the dictionaries,
105  // do a more complex normalization
106  return util::ascii_lower(str);
107 }
108 
109 std::vector<Match> omnimatch(const std::string & password,
110  const std::vector<std::string> & ordered_list) {
111  auto ranked_dictionaries = default_ranked_dicts();
112 
113  auto ranked_dict = build_ranked_dict(ordered_list);
114  ranked_dictionaries.insert(std::make_pair(DictionaryTag::USER_INPUTS,
115  std::cref(ranked_dict)));
116 
117  std::vector<Match> matches;
118  std::function<std::vector<Match>(const std::string &)> matchers[] = {
119  std::bind(dictionary_match, std::placeholders::_1,
120  std::cref(ranked_dictionaries)),
121  std::bind(reverse_dictionary_match, std::placeholders::_1,
122  std::cref(ranked_dictionaries)),
123  std::bind(l33t_match, std::placeholders::_1,
124  std::cref(ranked_dictionaries), std::cref(L33T_TABLE)),
125  std::bind(spatial_match, std::placeholders::_1,
126  std::cref(graphs())),
127  repeat_match,
129  std::bind(regex_match, std::placeholders::_1, std::cref(REGEXEN)),
130  date_match,
131  };
132  for (const auto & matcher : matchers) {
133  auto ret = matcher(password);
134  std::move(ret.begin(), ret.end(), std::back_inserter(matches));
135  }
136  return sorted(matches);
137 }
138 
139 //-------------------------------------------------------------------------------
140 // dictionary match (common passwords, english, last names, etc) ----------------
141 //-------------------------------------------------------------------------------
142 
143 std::vector<Match> dictionary_match(const std::string & password,
144  const RankedDicts & ranked_dictionaries) {
145  std::vector<Match> matches;
146  auto len = password.length();
147  auto password_lower = dict_normalize(password);
148  for (const auto & item : ranked_dictionaries) {
149  auto dictionary_tag = item.first;
150  auto & ranked_dict = item.second;
151  for (decltype(len) i = 0, idx = 0; idx < len; util::utf8_decode(password, idx), ++i) {
152  for (decltype(len) j = i, jdx = idx; jdx < len; ++j) {
153  // j is inclusive, but jdx is not so eagerly iterate jdx
154  util::utf8_decode(password, jdx);
155 
156  auto word = password_lower.substr(idx, jdx - idx);
157  auto it = ranked_dict.find(word);
158  if (it != ranked_dict.end()) {
159  auto rank = it->second;
160  matches.push_back(Match(i, j, password.substr(idx, jdx - idx),
162  dictionary_tag,
163  word, rank,
164  false,
165  false, {}, ""}));
166  matches.back().idx = idx;
167  matches.back().jdx = jdx;
168  }
169  }
170  }
171  }
172  return sorted(matches);
173 }
174 
175 std::vector<Match> reverse_dictionary_match(const std::string & password,
176  const RankedDicts & ranked_dictionaries) {
177  auto clen = util::character_len(password);
178  auto reversed_password = util::reverse_string(password);
179  auto matches = dictionary_match(reversed_password, ranked_dictionaries);
180  for (auto & match : matches) {
181  match.token = util::reverse_string(match.token); // reverse back
182  match.get_dictionary().reversed = true;
183  // map coordinates back to original string
184  std::tie(match.i, match.j) = std::make_tuple(
185  clen - 1 - match.j,
186  clen - 1 - match.i
187  );
188  std::tie(match.idx, match.jdx) = std::make_tuple(
189  password.length() - match.jdx,
190  password.length() - match.idx
191  );
192  }
193  return sorted(matches);
194 }
195 
196 //-------------------------------------------------------------------------------
197 // dictionary match with common l33t substitutions ------------------------------
198 //-------------------------------------------------------------------------------
199 
200 // makes a pruned copy of l33t_table that only includes password's possible substitutions
201 std::unordered_map<std::string, std::vector<std::string>> relevant_l33t_subtable(const std::string & password, const std::vector<std::pair<std::string, std::vector<std::string>>> & table) {
202  std::unordered_map<std::string, std::vector<std::string>> subtable;
203  for (const auto & item : table) {
204  auto & letter = item.first;
205  auto & subs = item.second;
206  std::vector<std::string> relevant_subs;
207  for (const auto & sub : subs) {
208  if (password.find(sub) != password.npos) {
209  relevant_subs.push_back(sub);
210  }
211  }
212 
213  if (relevant_subs.size()) {
214  subtable.insert(std::make_pair(letter, relevant_subs));
215  }
216  }
217  return subtable;
218 }
219 
220 // returns the list of possible 1337 replacement dictionaries for a given password
221 std::vector<std::unordered_map<std::string, std::string>> enumerate_l33t_subs(const std::unordered_map<std::string, std::vector<std::string>> & table) {
222  using SubsType = std::vector<std::vector<std::pair<std::string, std::string>>>;
223  SubsType subs = {{}};
224 
225  auto dedup = [] (const SubsType & subs) {
226  SubsType deduped;
227  std::unordered_set<std::string> members;
228  for (const auto & sub : subs) {
229  auto assoc = sub;
230  std::sort(assoc.begin(), assoc.end());
231  std::ostringstream label_stream;
232  for (const auto & item : assoc) {
233  label_stream << item.first << "," << item.second << "-";
234  }
235  auto label = label_stream.str();
236  if (members.find(label) == members.end()) {
237  members.insert(std::move(label));
238  deduped.push_back(sub);
239  }
240  }
241  return deduped;
242  };
243 
244  for (const auto & item : table) {
245  auto & first_key = item.first;
246  SubsType next_subs;
247  for (const auto & l33t_chr : item.second) {
248  for (const auto & sub : subs) {
249  auto sub_alternative = sub;
250  auto it = std::find_if(
251  sub_alternative.begin(), sub_alternative.end(),
252  [&] (const std::pair<std::string, std::string> & sub_elt) {
253  return sub_elt.first == l33t_chr;
254  });
255  if (it == sub_alternative.end()) {
256  sub_alternative.push_back(std::make_pair(l33t_chr, first_key));
257  next_subs.push_back(std::move(sub_alternative));
258  }
259  else {
260  sub_alternative.erase(it);
261  sub_alternative.push_back(std::make_pair(l33t_chr, first_key));
262  next_subs.push_back(sub);
263  next_subs.push_back(std::move(sub_alternative));
264  }
265  }
266  }
267  subs = dedup(next_subs);
268  }
269 
270  // convert from assoc lists to dicts
271  std::vector<std::unordered_map<std::string, std::string>> sub_dicts;
272  for (const auto & sub : subs) {
273  sub_dicts.push_back(std::unordered_map<std::string, std::string>
274  (sub.begin(), sub.end()));
275  }
276 
277  return sub_dicts;
278 }
279 
280 std::vector<Match> l33t_match(const std::string & password,
281  const RankedDicts & ranked_dictionaries,
282  const std::vector<std::pair<std::string, std::vector<std::string>>> & l33t_table) {
283  std::vector<Match> matches;
284  for (const auto & sub : enumerate_l33t_subs(relevant_l33t_subtable(password, l33t_table))) {
285  if (!sub.size()) break;
286  auto subbed_password = translate(password, sub);
287  for (auto & match : dictionary_match(subbed_password, ranked_dictionaries)) {
288  auto & dmatch = match.get_dictionary();
289  auto token = password.substr(match.idx, match.jdx - match.idx);
290  if (dict_normalize(token) == dmatch.matched_word) {
291  // only return the matches that contain an actual substitution
292  continue;
293  }
294  // subset of mappings in sub that are in use for this match
295  std::unordered_map<std::string, std::string> match_sub;
296  for (const auto & item : sub) {
297  auto & subbed_chr = item.first;
298  if (token.find(subbed_chr) == token.npos) continue;
299  match_sub.insert(item);
300  }
301  dmatch.l33t = true;
302  match.token = token;
303  dmatch.sub = match_sub;
304  std::ostringstream os;
305  std::string sep = "";
306  for (const auto & item : match_sub) {
307  os << sep << item.first << " -> " << item.second;
308  if (!sep.size()) {
309  sep = ", ";
310  }
311  }
312  dmatch.sub_display = os.str();
313  matches.push_back(std::move(match));
314  }
315  }
316 
317  matches.erase(std::remove_if(matches.begin(), matches.end(), [] (const Match & a) {
318  // filter single-character l33t matches to reduce noise.
319  // otherwise '1' matches 'i', '4' matches 'a', both very common English words
320  // with low dictionary rank.
321  return util::character_len(a.token) <= 1;
322  }),
323  matches.end());
324 
325  return sorted(matches);
326 }
327 
328 // ------------------------------------------------------------------------------
329 // spatial match (qwerty/dvorak/keypad) -----------------------------------------
330 // ------------------------------------------------------------------------------
331 
332 static
333 std::vector<Match> spatial_match_helper(const std::string & password,
334  const Graph & graph,
335  GraphTag tag);
336 
337 std::vector<Match> spatial_match(const std::string & password,
338  const Graphs & graphs) {
339  std::vector<Match> matches;
340  for (const auto & item : graphs) {
341  auto ret = spatial_match_helper(password, item.second, item.first);
342  std::move(ret.begin(), ret.end(), std::back_inserter(matches));
343  }
344  return matches;
345 }
346 
347 const auto SHIFTED_RX = std::regex("[~!@#$%^&*()_+QWERTYUIOP{}|ASDFGHJKL:\"ZXCVBNM<>?]");
348 
349 static
350 std::vector<Match> spatial_match_helper(const std::string & password,
351  const Graph & graph,
352  GraphTag graph_tag) {
353  std::vector<Match> matches;
354  if (!password.length()) return matches;
355  idx_t idx = 0;
356  idx_t i = 0;
357  auto clen = util::character_len(password);
358  while (i < clen - 1) {
359  auto jdx = idx;
360  util::utf8_decode(password, jdx);
361  auto j = i + 1;
362  auto last_direction = -1;
363  unsigned turns = 0;
364  unsigned shifted_count;
365  if ((graph_tag == GraphTag::QWERTY ||
366  graph_tag == GraphTag::DVORAK) &&
367  std::regex_search(password.substr(idx, jdx - idx), SHIFTED_RX)) {
368  shifted_count = 1;
369  }
370  else {
371  shifted_count = 0;
372  }
373  auto prev_jdx = idx;
374  while (true) {
375  auto prev_char = password.substr(prev_jdx, jdx - prev_jdx);
376  auto found = false;
377  auto found_direction = -1;
378  auto cur_direction = -1;
379  const auto & adjacents = [&] {
380  auto it = graph.find(prev_char);
381  if (it != graph.end()) {
382  return it->second;
383  }
384  return std::vector<optional::optional<std::string>>();
385  }();
386  // consider growing pattern by one character if j hasn't gone over the edge.
387  if (j < clen) {
388  auto next_jdx = jdx;
389  util::utf8_decode(password, next_jdx);
390  auto cur_char = password.substr(jdx, next_jdx - jdx);
391  for (auto & adj : adjacents) {
392  cur_direction += 1;
393  if (adj && adj->find(cur_char) != adj->npos) {
394  found = true;
395  found_direction = cur_direction;
396  if (adj->find(cur_char) == 1) {
397  // index 1 in the adjacency means the key is shifted,
398  // 0 means unshifted: A vs a, % vs 5, etc.
399  // for example, 'q' is adjacent to the entry '2@'.
400  // @ is shifted w/ index 1, 2 is unshifted.
401  shifted_count += 1;
402  }
403  if (last_direction != found_direction) {
404  // adding a turn is correct even in the initial case when last_direction is null:
405  // every spatial pattern starts with a turn.
406  turns += 1;
407  last_direction = found_direction;
408  }
409  break;
410  }
411  }
412  }
413  // if the current pattern continued, extend j and try to grow again
414  if (found) {
415  j += 1;
416  prev_jdx = jdx;
417  util::utf8_decode(password, jdx);
418  }
419  // otherwise push the pattern discovered so far, if any...
420  else {
421  if (j - i > 2) { // don't consider length 1 or 2 chains.
422  matches.push_back(Match(i, j - 1, password.substr(idx, jdx - idx),
423  SpatialMatch{
424  graph_tag, turns, shifted_count,
425  }));
426  matches.back().idx = idx;
427  matches.back().jdx = jdx;
428  }
429  // ...and then start a new search for the rest of the password.
430  i = j;
431  idx = jdx;
432  break;
433  }
434  }
435  }
436  return matches;
437 }
438 
439 //-------------------------------------------------------------------------------
440 // repeats (aaa, abcabcabc) and sequences (abcdef) ------------------------------
441 //-------------------------------------------------------------------------------
442 
443 std::vector<Match> repeat_match(const std::string & password) {
444  std::vector<Match> matches;
445  std::regex greedy(R"((.+)\1+)");
446  std::regex lazy(R"((.+?)\1+)");
447  std::regex lazy_anchored(R"(^(.+?)\1+$)");
448  idx_t lastIndex = 0;
449  while (lastIndex < password.length()) {
450  auto start_iter = lastIndex + password.begin();
451  std::smatch greedy_match, lazy_match;
452  std::regex_search(start_iter, password.end(),
453  greedy_match, greedy);
454  std::regex_search(start_iter, password.end(),
455  lazy_match, lazy);
456  if (!greedy_match.size()) break;
457  std::smatch match;
458  std::string base_token;
459  if (greedy_match[0].length() > lazy_match[0].length()) {
460  // greedy beats lazy for 'aabaab'
461  // greedy: [aabaab, aab]
462  // lazy: [aa, a]
463  match = greedy_match;
464  // greedy's repeated string might itself be repeated, eg.
465  // aabaab in aabaabaabaab.
466  // run an anchored lazy match on greedy's repeated string
467  // to find the shortest repeated string
468  std::smatch lazy_anchored_match;
469  auto greedy_found = match.str(0);
470  auto ret = std::regex_search(greedy_found, lazy_anchored_match, lazy_anchored);
471  assert(ret);
472  (void) ret;
473  base_token = lazy_anchored_match.str(1);
474  }
475  else {
476  // lazy beats greedy for 'aaaaa'
477  // greedy: [aaaa, aa]
478  // lazy: [aaaaa, a]
479  match = std::move(lazy_match);
480  base_token = match.str(1);
481  }
482  auto idx = lastIndex + match.position();
483  auto jdx = lastIndex + match.position() + match[0].length();
484  auto i = util::character_len(password, 0, idx);
485  auto j = i + util::character_len(password, idx, jdx) - 1;
486  // recursively match and score the base string
487  auto sub_matches = omnimatch(base_token);
488  auto base_analysis = most_guessable_match_sequence(
489  base_token,
490  sub_matches,
491  false
492  );
493  std::vector<Match> base_matches;
494  std::move(base_analysis.sequence.begin(), base_analysis.sequence.end(),
495  std::back_inserter(base_matches));
496  auto & base_guesses = base_analysis.guesses;
497  matches.push_back(Match(i, j, match.str(0),
498  RepeatMatch{
499  base_token,
500  base_guesses,
501  std::move(base_matches),
502  match[0].length() / base_token.length(),
503  }));
504  matches.back().idx = idx;
505  matches.back().jdx = jdx;
506  lastIndex = jdx;
507  }
508  return matches;
509 }
510 
511 const auto MAX_DELTA = 5;
512 std::vector<Match> sequence_match(const std::string & password) {
513  // Identifies sequences by looking for repeated differences in unicode codepoint.
514  // this allows skipping, such as 9753, and also matches some extended unicode sequences
515  // such as Greek and Cyrillic alphabets.
516  //
517  // for example, consider the input 'abcdb975zy'
518  //
519  // password: a b c d b 9 7 5 z y
520  // index: 0 1 2 3 4 5 6 7 8 9
521  // delta: 1 1 1 -2 -41 -2 -2 69 1
522  //
523  // expected result:
524  // [(i, j, delta), ...] = [(0, 3, 1), (5, 7, -2), (8, 9, 1)]
525 
526  if (util::character_len(password) == 1) return {};
527 
528  std::vector<Match> result;
529 
530  using delta_t = std::int32_t;
531 
532  auto update = [&] (idx_t i, idx_t j, idx_t idx, idx_t jdx, delta_t delta) {
533  if (j - i > 1 || std::abs(delta) == 1) {
534  if (0 < std::abs(delta) && std::abs(delta) <= MAX_DELTA) {
535  auto token = password.substr(idx, jdx - idx);
536  SequenceTag sequence_name;
537  unsigned sequence_space;
538  if (std::regex_search(token, std::regex(R"(^[a-z]+$)"))) {
539  sequence_name = SequenceTag::LOWER;
540  sequence_space = 26;
541  }
542  else if (std::regex_search(token, std::regex(R"(^[A-Z]+$)"))) {
543  sequence_name = SequenceTag::UPPER;
544  sequence_space = 26;
545  }
546  else if (std::regex_search(token, std::regex(R"(^\d+$)"))) {
547  sequence_name = SequenceTag::DIGITS;
548  sequence_space = 10;
549  }
550  else {
551  sequence_name = SequenceTag::UNICODE;
552  sequence_space = 26;
553  }
554  result.push_back(Match(i, j, token,
555  SequenceMatch{sequence_name, sequence_space,
556  delta > 0}));
557  result.back().idx = idx;
558  result.back().jdx = jdx;
559  }
560  }
561  };
562 
563  if (!password.size()) return result;
564 
565  decltype(password.length()) i = 0;
566  decltype(password.length()) idx = 0;
567  optional::optional<delta_t> maybe_last_delta;
568  decltype(password.length()) kdx = 0;
569  auto last_kdx = kdx;
570  auto last_cp = util::utf8_decode(password, kdx);
571  for (idx_t k = 1; kdx < password.length(); ++k) {
572  auto next_kdx = kdx;
573  auto cp = util::utf8_decode(password, next_kdx);
574  assert(kdx != next_kdx);
575  delta_t delta = cp - last_cp;
576  if (!maybe_last_delta) {
577  maybe_last_delta = delta;
578  }
579  if (delta != *maybe_last_delta) {
580  auto jdx = kdx;
581  auto j = k - 1;
582  update(i, j, idx, jdx, *maybe_last_delta);
583  i = j;
584  idx = last_kdx;
585  maybe_last_delta = delta;
586  }
587  last_kdx = kdx;
588  kdx = next_kdx;
589  last_cp = cp;
590  }
591  if (maybe_last_delta) {
592  update(i, util::character_len(password) - 1,
593  idx, password.size(), *maybe_last_delta);
594  }
595  return result;
596 }
597 
598 
599 //-------------------------------------------------------------------------------
600 // regex matching ---------------------------------------------------------------
601 //-------------------------------------------------------------------------------
602 
603 std::vector<Match> regex_match(const std::string & password,
604  const std::vector<std::pair<RegexTag, std::regex>> & regexen) {
605  std::vector<Match> matches;
606  for (const auto & item : regexen) {
607  auto tag = item.first;
608  auto & regex = item.second;
609  std::smatch rx_match;
610  std::size_t lastIndex = 0;
611  while (std::regex_match(lastIndex + password.begin(), password.end(),
612  rx_match, regex)) {
613  auto token = rx_match.str(0);
614  auto idx = lastIndex + rx_match.position();
615  auto jdx = lastIndex + rx_match.position() + rx_match[0].length();
616  assert(token == password.substr(idx, jdx - idx));
617  auto i = util::character_len(password, 0, idx);
618  auto j = i + util::character_len(password, idx, jdx) - 1;
619  matches.push_back(Match(i, j,
620  std::move(token),
621  RegexMatch{tag, PortableRegexMatch(rx_match)}));
622  matches.back().idx = idx;
623  matches.back().jdx = jdx;
624  lastIndex += rx_match[0].length();
625  }
626  }
627  return sorted(matches);
628 }
629 
630 //-------------------------------------------------------------------------------
631 // date matching ----------------------------------------------------------------
632 //-------------------------------------------------------------------------------
633 
634 using date_t = unsigned;
635 
636 struct DMY {
637  date_t year, month, day;
638 };
639 
640 static
641 optional::optional<DMY> map_ints_to_dmy(const std::array<date_t, 3> & vals);
642 
643 static
644 date_t stou(const std::string & a) {
645  return static_cast<date_t>(std::stoul(a));
646 }
647 
648 std::vector<Match> date_match(const std::string & password) {
649  // a "date" is recognized as:
650  // any 3-tuple that starts or ends with a 2- or 4-digit year,
651  // with 2 or 0 separator chars (1.1.91 or 1191),
652  // maybe zero-padded (01-01-91 vs 1-1-91),
653  // a month between 1 and 12,
654  // a day between 1 and 31.
655  //
656  // note: this isn't true date parsing in that "feb 31st" is allowed,
657  // this doesn't check for leap years, etc.
658  //
659  // recipe:
660  // start with regex to find maybe-dates, then attempt to map the integers
661  // onto month-day-year to filter the maybe-dates into dates.
662  // finally, remove matches that are substrings of other matches to reduce noise.
663  //
664  // note: instead of using a lazy or greedy regex to find many dates over the full string,
665  // this uses a ^...$ regex against every substring of the password -- less performant but leads
666  // to every possible date match.
667  std::vector<Match> matches;
668  std::regex maybe_date_no_separator(R"(^\d{4,8}$)");
669  std::regex maybe_date_with_separator(R"(^(\d{1,4})([\s/\\_.-])(\d{1,2})\2(\d{1,4})$)");
670 
671  // dates without separators are between length 4 '1191' and 8 '11111991'
672  std::vector<std::string::size_type> offsets;
673  offsets.reserve(9);
674  idx_t idx_dot = 0;
675  for (auto i = 0; i < 9; ++i) {
676  offsets.push_back(idx_dot);
677  if (idx_dot >= password.length()) {
678  break;
679  }
680  util::utf8_decode(password, idx_dot);
681  }
682  assert(offsets.size());
683  for (decltype(password.length()) i = 0; offsets.size() - 1 >= 4; ++i) {
684  auto idx = offsets[0];
685  for (decltype(i) offset = 3; offset <= 7 && offset < offsets.size() - 1; ++offset) {
686  auto j = i + offset;
687  auto jdx = offsets[offset + 1];
688 
689  auto token = password.substr(idx, jdx - idx);
690  auto token_chr_len = j - i + 1;
691  assert(util::character_len(token) == token_chr_len);
692  if (!std::regex_search(token, maybe_date_no_separator)) continue;
693  std::vector<DMY> candidates;
694  for (const auto & item : DATE_SPLITS[token_chr_len - 4]) {
695  auto k = item.first;
696  auto l = item.second;
697  auto kdx = offsets[k] - idx;
698  auto ldx = offsets[l] - idx;
699 
700  auto dmy = map_ints_to_dmy(std::array<date_t, 3>{{
701  stou(token.substr(0, kdx)),
702  stou(token.substr(kdx, ldx - kdx)),
703  stou(token.substr(ldx))}});
704  if (dmy) candidates.push_back(*dmy);
705  }
706  if (!candidates.size()) continue;
707  // at this point: different possible dmy mappings for the same i,j substring.
708  // match the candidate date that likely takes the fewest guesses: a year closest to 2000.
709  // (scoring.REFERENCE_YEAR).
710  //
711  // ie, considering '111504', prefer 11-15-04 to 1-1-1504
712  // (interpreting '04' as 2004)
713  auto metric = [] (const DMY & candidate) {
714  if (candidate.year >= REFERENCE_YEAR) {
715  return candidate.year - REFERENCE_YEAR;
716  }
717  else {
718  return REFERENCE_YEAR - candidate.year;
719  }
720  };
721  auto best_candidate = *std::min_element(candidates.begin(), candidates.end(),
722  [=] (const DMY & a, const DMY & b) {
723  return metric(a) < metric(b);
724  });
725  matches.push_back(Match(i, j, token,
726  DateMatch{"",
727  best_candidate.year,
728  best_candidate.month,
729  best_candidate.day,
730  false,
731  }));
732  matches.back().idx = idx;
733  matches.back().jdx = jdx;
734  }
735 
736  offsets.erase(offsets.begin());
737  if (offsets.back() < password.length()) {
738  auto idx2 = offsets.back();
739  util::utf8_decode(password, idx2);
740  offsets.push_back(idx2);
741  }
742  }
743 
744  // dates with separators are between length 6 '1/1/91' and 10 '11/11/1991'
745  offsets.clear();
746  offsets.reserve(11);
747  idx_dot = 0;
748  for (auto i = 0; i < 11; ++i) {
749  offsets.push_back(idx_dot);
750  if (idx_dot >= password.length()) {
751  break;
752  }
753  util::utf8_decode(password, idx_dot);
754  }
755  assert(offsets.size());
756  for (decltype(password.length()) i = 0; offsets.size() - 1 >= 6; ++i) {
757  auto idx = offsets[0];
758  for (decltype(password.length()) offset = 5; offset <= 9 && offset < offsets.size() - 1; ++offset) {
759  auto j = offset + i;
760  auto jdx = offsets[offset + 1];
761  auto token = password.substr(idx, jdx - idx);
762  std::smatch rx_match;
763  if (!std::regex_match(token, rx_match, maybe_date_with_separator)) {
764  continue;
765  }
766  auto dmy = map_ints_to_dmy(std::array<date_t, 3>{{
767  stou(rx_match[1]),
768  stou(rx_match[3]),
769  stou(rx_match[4])}});
770  if (!dmy) continue;
771  matches.push_back(Match(i, j, token,
772  DateMatch{rx_match[2],
773  dmy->year,
774  dmy->month,
775  dmy->day,
776  false,
777  }));
778  matches.back().idx = idx;
779  matches.back().jdx = jdx;
780  }
781 
782  offsets.erase(offsets.begin());
783  if (offsets.back() < password.length()) {
784  auto idx2 = offsets.back();
785  util::utf8_decode(password, idx2);
786  offsets.push_back(idx2);
787  }
788  }
789 
790  // matches now contains all valid date strings in a way that is tricky to capture
791  // with regexes only. while thorough, it will contain some unintuitive noise:
792  //
793  // '2015_06_04', in addition to matching 2015_06_04, will also contain
794  // 5(!) other date matches: 15_06_04, 5_06_04, ..., even 2015 (matched as 5/1/2020)
795  //
796  // to reduce noise, remove date matches that are strict substrings of others
797  matches.erase(std::remove_if(matches.begin(), matches.end(), [&] (const Match & match) {
798  for (auto & other_match : matches) {
799  if (other_match.i == match.i && other_match.j == match.j) continue;
800  if (other_match.i <= match.i && other_match.j >= match.j) {
801  return true;
802  }
803  }
804  return false;
805  }),
806  matches.end());
807 
808  return sorted(matches);
809 }
810 
811 static
812 optional::optional<DMY> map_ints_to_dm(const std::array<date_t, 2> & vals);
813 
814 static
815 date_t two_to_four_digit_year(date_t val);
816 
817 static
818 optional::optional<DMY> map_ints_to_dmy(const std::array<date_t, 3> & vals) {
819  // given a 3-tuple, discard if:
820  // middle int is over 31 (for all dmy formats, years are never allowed in the middle)
821  // middle int is zero
822  // any int is over the max allowable year
823  // any int is over two digits but under the min allowable year
824  // 2 ints are over 31, the max allowable day
825  // 2 ints are zero
826  // all ints are over 12, the max allowable month
827  if (vals[1] > 31 || vals[1] == 0) return optional::nullopt;
828  auto over_12 = 0;
829  auto over_31 = 0;
830  auto under_1 = 0;
831  for (auto val : vals) {
832  if ((99 < val && val < DATE_MIN_YEAR) || val > DATE_MAX_YEAR) return optional::nullopt;
833  if (val > 31) over_31 += 1;
834  if (val > 12) over_12 += 1;
835  if (val <= 0) under_1 += 1;
836  }
837  if (over_31 >= 2 || over_12 == 3 || under_1 >= 2) return optional::nullopt;
838 
839  // first look for a four digit year: yyyy + daymonth or daymonth + yyyy
840  std::pair<date_t, std::array<date_t, 2>> possible_year_splits[] = {
841  {vals[2], {{vals[0], vals[1]}}}, // year last
842  {vals[0], {{vals[1], vals[2]}}}, // year first
843  };
844  for (const auto & item : possible_year_splits) {
845  auto & y = item.first;
846  auto & rest = item.second;
847  if (DATE_MIN_YEAR <= y && y <= DATE_MAX_YEAR) {
848  auto dm = map_ints_to_dm(rest);
849  if (dm) {
850  return DMY{y, dm->month, dm->day};
851  }
852  else {
853  // for a candidate that includes a four-digit year,
854  // when the remaining ints don't match to a day and month,
855  // it is not a date.
856  return optional::nullopt;
857  }
858  }
859  }
860 
861  // given no four-digit year, two digit years are the most flexible int to match, so
862  // try to parse a day-month out of ints[0..1] or ints[1..0]
863  for (const auto & item : possible_year_splits) {
864  auto y = item.first;
865  auto & rest = item.second;
866  auto dm = map_ints_to_dm(rest);
867  if (dm) {
868  y = two_to_four_digit_year(y);
869  return DMY{y, dm->month, dm->day};
870  }
871  }
872 
873  return optional::nullopt;
874 }
875 
876 static
877 optional::optional<DMY> map_ints_to_dm(const std::array<date_t, 2> & vals) {
878  for (const auto & item : {vals, {{vals[1], vals[0]}}}) {
879  auto d = item[0], m = item[1];
880  if (1 <= d && d <= 31 && 1 <= m && m <= 12) {
881  return DMY{0, m, d};
882  }
883  }
884  return optional::nullopt;
885 }
886 
887 static
888 date_t two_to_four_digit_year(date_t year) {
889  if (year > 99) {
890  return year;
891  }
892  else if (year > 50) {
893  // 87 -> 1987
894  return year + 1900;
895  }
896  else {
897  // 15 -> 2015
898  return year + 2000;
899  }
900 }
901 
902 }
zxcvbn::repeat_match
std::vector< Match > repeat_match(const std::string &password)
Definition: matching.cpp:443
zxcvbn::SequenceTag
SequenceTag
Definition: common.hpp:37
zxcvbn::most_guessable_match_sequence
ScoringResult most_guessable_match_sequence(const std::string &password, std::vector< Match > &matches, bool exclude_additive)
Definition: scoring.cpp:93
zxcvbn::reverse_dictionary_match
std::vector< Match > reverse_dictionary_match(const std::string &password, const RankedDicts &ranked_dictionaries)
Definition: matching.cpp:175
b
void const uint64_t * b
Definition: field_5x52_asm_impl.h:10
zxcvbn::regex_match
std::vector< Match > regex_match(const std::string &password, const std::vector< std::pair< RegexTag, std::regex >> &regexen)
Definition: matching.cpp:603
zxcvbn::DATE_MAX_YEAR
const auto DATE_MAX_YEAR
Definition: matching.cpp:45
zxcvbn::DMY::year
date_t year
Definition: matching.cpp:637
common.hpp
zxcvbn::util::ascii_lower
std::string ascii_lower(const std::string &in)
Definition: util.cpp:15
zxcvbn::RankedDicts
std::unordered_map< DictionaryTag, const RankedDict & > RankedDicts
Definition: frequency_lists.hpp:30
zxcvbn::DictionaryMatch
Definition: common.hpp:67
zxcvbn
Definition: _frequency_lists.cpp:7
zxcvbn::enumerate_l33t_subs
std::vector< std::unordered_map< std::string, std::string > > enumerate_l33t_subs(const std::unordered_map< std::string, std::vector< std::string >> &table)
Definition: matching.cpp:221
zxcvbn::DMY
Definition: matching.cpp:636
zxcvbn::omnimatch
std::vector< Match > omnimatch(const std::string &password, const std::vector< std::string > &ordered_list)
Definition: matching.cpp:109
zxcvbn::util::character_len
std::string::size_type character_len(const std::string &str, std::string::size_type start, std::string::size_type end)
Definition: util.cpp:84
zxcvbn::default_ranked_dicts
RankedDicts default_ranked_dicts()
Definition: frequency_lists.cpp:19
zxcvbn::l33t_match
std::vector< Match > l33t_match(const std::string &password, const RankedDicts &ranked_dictionaries, const std::vector< std::pair< std::string, std::vector< std::string >>> &l33t_table)
Definition: matching.cpp:280
zxcvbn::RegexTag::RECENT_YEAR
@ RECENT_YEAR
zxcvbn::spatial_match
std::vector< Match > spatial_match(const std::string &password, const Graphs &graphs)
Definition: matching.cpp:337
zxcvbn::util::reverse_string
std::string reverse_string(const std::string &in)
Definition: util.cpp:28
zxcvbn::util::utf8_decode
std::pair< char32_t, std::string::iterator > utf8_decode(std::string::iterator start, std::string::iterator end)
Definition: util.cpp:124
zxcvbn::Graphs
std::unordered_map< GraphTag, Graph > Graphs
Definition: adjacency_graphs.hpp:39
zxcvbn::SHIFTED_RX
const auto SHIFTED_RX
Definition: matching.cpp:347
zxcvbn::DateMatch
Definition: common.hpp:115
zxcvbn::RepeatMatch
Definition: common.hpp:91
zxcvbn::L33T_TABLE
const std::vector< std::pair< std::string, std::vector< std::string > > > L33T_TABLE
Definition: matching.hpp:13
zxcvbn::REFERENCE_YEAR
const auto REFERENCE_YEAR
Definition: scoring.hpp:20
zxcvbn::util::utf8_iter
std::string::iterator utf8_iter(std::string::iterator start, std::string::iterator end)
Definition: util.cpp:74
zxcvbn::sequence_match
std::vector< Match > sequence_match(const std::string &password)
Definition: matching.cpp:512
scoring.hpp
zxcvbn::date_match
std::vector< Match > date_match(const std::string &password)
Definition: matching.cpp:648
zxcvbn::relevant_l33t_subtable
std::unordered_map< std::string, std::vector< std::string > > relevant_l33t_subtable(const std::string &password, const std::vector< std::pair< std::string, std::vector< std::string >>> &table)
Definition: matching.cpp:201
zxcvbn::GraphTag
GraphTag
Definition: adjacency_graphs.hpp:16
zxcvbn::dictionary_match
std::vector< Match > dictionary_match(const std::string &password, const RankedDicts &ranked_dictionaries)
Definition: matching.cpp:143
matching.hpp
zxcvbn::date_t
unsigned date_t
Definition: matching.cpp:634
zxcvbn::Graph
std::unordered_map< std::string, std::vector< optional::optional< std::string > >> Graph
Definition: adjacency_graphs.hpp:38
zxcvbn::idx_t
std::string::size_type idx_t
Definition: common.hpp:18
zxcvbn::graphs
const Graphs & graphs()
Definition: adjacency_graphs.cpp:270
frequency_lists.hpp
zxcvbn::RegexMatch
Definition: common.hpp:108
zxcvbn::optional::optional
Definition: optional.hpp:31
zxcvbn::MAX_DELTA
const auto MAX_DELTA
Definition: matching.cpp:511
zxcvbn::DATE_SPLITS
constexpr std::initializer_list< std::pair< int, int > > DATE_SPLITS[]
Definition: matching.cpp:47
adjacency_graphs.hpp
zxcvbn::SequenceMatch
Definition: common.hpp:100
zxcvbn::PortableRegexMatch
Definition: common.hpp:44
zxcvbn::Match
Definition: common.hpp:133
zxcvbn::REGEXEN
const std::vector< std::pair< RegexTag, std::regex > > REGEXEN
Definition: matching.hpp:14
util.hpp
zxcvbn::build_ranked_dict
RankedDict build_ranked_dict(const T &ordered_list)
Definition: frequency_lists_common.hpp:16
zxcvbn::DATE_MIN_YEAR
const auto DATE_MIN_YEAR
Definition: matching.cpp:46
zxcvbn::DateMatch::year
unsigned year
Definition: common.hpp:119
optional.hpp