// _unicodeWords.js (3.0KB)
  /**
   * Used to compose unicode character classes.
   *
   * Each `*Range` value is regex source text intended to be placed inside a
   * `[...]` class; backslashes are doubled so the escapes reach `RegExp`
   * rather than being interpreted by the string literal.
   */
  var rsAstralRange = '\\ud800-\\udfff',
      rsComboMarksRange = '\\u0300-\\u036f',
      reComboHalfMarksRange = '\\ufe20-\\ufe2f',
      rsComboSymbolsRange = '\\u20d0-\\u20ff',
      rsComboRange = rsComboMarksRange + reComboHalfMarksRange + rsComboSymbolsRange,
      rsDingbatRange = '\\u2700-\\u27bf',
      rsLowerRange = 'a-z\\xdf-\\xf6\\xf8-\\xff',
      rsMathOpRange = '\\xac\\xb1\\xd7\\xf7',
      rsNonCharRange = '\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf',
      rsPunctuationRange = '\\u2000-\\u206f',
      rsSpaceRange = ' \\t\\x0b\\f\\xa0\\ufeff\\n\\r\\u2028\\u2029\\u1680\\u180e\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u202f\\u205f\\u3000',
      rsUpperRange = 'A-Z\\xc0-\\xd6\\xd8-\\xde',
      rsVarRange = '\\ufe0e\\ufe0f',
      // Everything that separates words: math operators, ASCII/Latin-1
      // non-alphanumerics, general punctuation and whitespace.
      rsBreakRange = rsMathOpRange + rsNonCharRange + rsPunctuationRange + rsSpaceRange;

  /** Used to compose unicode capture groups. */
  var rsApos = "['\u2019]",
      rsBreak = '[' + rsBreakRange + ']',
      rsCombo = '[' + rsComboRange + ']',
      // Single-digit class. Kept apart from `rsDigits` so it can be embedded in
      // another `[...]` class, where a trailing `+` would be a literal plus sign.
      rsDigit = '\\d',
      rsDigits = rsDigit + '+',
      rsDingbat = '[' + rsDingbatRange + ']',
      rsLower = '[' + rsLowerRange + ']',
      // Any non-astral character that is not a break, digit, dingbat or cased letter.
      rsMisc = '[^' + rsAstralRange + rsBreakRange + rsDigit + rsDingbatRange + rsLowerRange + rsUpperRange + ']',
      rsFitz = '\\ud83c[\\udffb-\\udfff]',
      rsModifier = '(?:' + rsCombo + '|' + rsFitz + ')',
      rsNonAstral = '[^' + rsAstralRange + ']',
      rsRegional = '(?:\\ud83c[\\udde6-\\uddff]){2}',
      rsSurrPair = '[\\ud800-\\udbff][\\udc00-\\udfff]',
      rsUpper = '[' + rsUpperRange + ']',
      rsZWJ = '\\u200d';

  /** Used to compose unicode regexes. */
  var rsMiscLower = '(?:' + rsLower + '|' + rsMisc + ')',
      rsMiscUpper = '(?:' + rsUpper + '|' + rsMisc + ')',
      // Optional trailing contraction: an apostrophe followed by one of the listed suffixes.
      rsOptContrLower = '(?:' + rsApos + '(?:d|ll|m|re|s|t|ve))?',
      rsOptContrUpper = '(?:' + rsApos + '(?:D|LL|M|RE|S|T|VE))?',
      reOptMod = rsModifier + '?',
      rsOptVar = '[' + rsVarRange + ']?',
      // Zero or more zero-width-joiner continuations of a symbol.
      rsOptJoin = '(?:' + rsZWJ + '(?:' + [rsNonAstral, rsRegional, rsSurrPair].join('|') + ')' + rsOptVar + reOptMod + ')*',
      rsOrdLower = '\\d*(?:1st|2nd|3rd|(?![123])\\dth)(?=\\b|[A-Z_])',
      rsOrdUpper = '\\d*(?:1ST|2ND|3RD|(?![123])\\dTH)(?=\\b|[a-z_])',
      rsSeq = rsOptVar + reOptMod + rsOptJoin,
      rsEmoji = '(?:' + [rsDingbat, rsRegional, rsSurrPair].join('|') + ')' + rsSeq;

  /**
   * Used to match complex or compound words.
   *
   * The alternatives are tried in the order listed, so the more specific word
   * shapes come before the bare digit and emoji fallbacks.
   */
  var reUnicodeWord = RegExp([
    // Optional capital + lowercase run (+ contraction), ending at a break, a capital or the end.
    rsUpper + '?' + rsLower + '+' + rsOptContrLower + '(?=' + [rsBreak, rsUpper, '$'].join('|') + ')',
    // Uppercase/misc run (+ contraction), ending at a break, a capital that starts a new word, or the end.
    rsMiscUpper + '+' + rsOptContrUpper + '(?=' + [rsBreak, rsUpper + rsMiscLower, '$'].join('|') + ')',
    // Optional capital + lowercase/misc run (+ contraction), no lookahead required.
    rsUpper + '?' + rsMiscLower + '+' + rsOptContrLower,
    // Run of capitals (+ contraction).
    rsUpper + '+' + rsOptContrUpper,
    rsOrdUpper,
    rsOrdLower,
    rsDigits,
    rsEmoji
  ].join('|'), 'g');
  /**
   * Splits a Unicode `string` into an array of its words.
   *
   * @private
   * @param {string} string The string to inspect.
   * @returns {Array} Returns the words of `string`.
   */
  function unicodeWords(string) {
    // `match` yields `null` when nothing matches; normalize that to an empty array.
    return string.match(reUnicodeWord) || [];
  }
  // Expose the word splitter as this module's sole export.
  module.exports = unicodeWords;