Remove a large swath of unmaintained emoji regex generation code

2025-03-12 07:14:20 +00:00 · 2019-04-03 17:53:39 -07:00 · 2019-04-03 17:53:39 -07:00 · 6f4fd350d7
commit 6f4fd350d7
parent 0e43130c2d
1 changed files with 6 additions and 445 deletions
--- a/2/scripts/generate
+++ b/2/scripts/generate
@@ -15,450 +15,7 @@ function file(which) {
  return path.join(__dirname, '../..', which);
 }

-// Twitter assets by property name
-var assets = {
-  '2/72x72': [],
-  '2/svg': []
-};
-
-var skinToneOptions = [
-  '\\ud83c\\udffb',
-  '\\ud83c\\udffc',
-  '\\ud83c\\udffd',
-  '\\ud83c\\udffe',
-  '\\ud83c\\udfff'
-];
-
-// white spaces we don't want to catch via the RegExp
-// there is no asset equivalent for these
-var ignoreMissing = ['2002', '2003', '2005'];
-
-// Items is an array of unicode sequences with \u escaping, like ["\u2963\ufe0f", "\u263a\ufe0f"]
-// items get sorted by length (long to short), then unicode hex values (low to high)
-// output is "or" ed together using | for regex
-// ouput also combines adjacent items using character classes with ranges when they have common prefixes
-// Example: "aab", "aac", "aad", "aag", "ba" becomes "aa[b-dg]|ba"
-function generateRegexPartial(items) {
-  var currentPrefix = null;
-  var result = [];
-  var charClass = [];
-  var charRange = [];
-  items.map(function (item) {
-    // Convert from "\u2963\ufe0f" into ["2963", "fe0f"]
-    return item.split('\\u').slice(1);
-  }).sort(sortMethod).forEach(function (itemParts) {
-    var prefix = itemParts.slice(0, -1).join('\\u');
-    if (prefix) {
-      prefix = '\\u' + prefix;
-    }
-    var suffix = itemParts.slice(-1);
-    if (prefix !== currentPrefix) {
-      flushCharClass();
-    }
-    currentPrefix = prefix;
-    var suffixMinusOne = Utils.UTF162JSON(String.fromCharCode(parseInt(suffix, 16) - 1));
-
-    if (charRange.length && charRange.slice(-1)[0] !== suffixMinusOne) {
-      flushCharRange();
-    }
-    charRange.push('\\u' + suffix);
-  });
-
-  flushCharClass();
-  return result.join('|');
-
-  // a and b are arrays of hex UCS-2 units
-  function sortMethod(a, b) {
-    return !a.length ? 0 :
-      b.length - a.length ||
-      parseInt(a[0], 16) - parseInt(b[0], 16) ||
-      sortMethod(a.slice(1), b.slice(1)
-    );
-  }
-
-  function flushCharRange() {
-    charClass = charClass.concat((charRange.length < 3) ?
-      charRange :
-      [ charRange[0], '-', charRange.slice(-1)[0] ]
-    );
-    charRange = [];
-  }
-
-  function flushCharClass() {
-    flushCharRange();
-    if (charClass.length) {
-      result.push(currentPrefix + (charClass.length == 1 ?
-        charClass[0] :
-        '[' + charClass.join('') + ']'
-      ));
-    }
-    charClass = [];
-    currentPrefix = null;
-  }
-}
-
-// basic utility to organize async code
-// see: http://webreflection.blogspot.co.uk/2012/03/tweet-sized-queue-system.html
-// or:  http://webreflection.blogspot.co.uk/2012/06/working-with-queues.html
-function Queue(args, f) {
-  setTimeout(args.next = function next() {
-    return (f = args.shift()) ? !!f(args) || !0 : !1;
-  }, 0);
-  return args;
-}
-
-// main task
-Queue([
-
-  // will populate assets arrays
-  function grabAllAssets(q) {
-    console.log('analyzing all assets ... ');
-    // per each path/folder
-    Object.keys(assets).forEach(function (path, i, paths) {
-      // grab all files in that folder
-      fs.readdir(file(path), function (err, files) {
-        // and add them to the assets path
-        assets[path].push.apply(
-          assets[path],
-          files.map(upperCaseWithoutExtension)
-        );
-        // once all assets arrays have been populated
-        if (paths.reduce(completed, true)) {
-          console.log('[INFO] assets contains ' + assets[path].length + ' emoji.');
-          q.next();
-        }
-      });
-    });
-    // drop extension + uppercase
-    function upperCaseWithoutExtension(file) {
-      return file.slice(0, file.lastIndexOf('.')).toUpperCase();
-    }
-    // returns true if all assets have been populated
-    function completed(p, c) {
-      return p && assets[c].length;
-    }
-  },
-
-  // will fetch and store all emoji from unicode.org
-  function fetchEmojiSources(q) {
-    console.log('fetching EmojiSources.txt ... ');
-    // grab all emoji and test them against them
-    http.get("http://www.unicode.org/Public/UNIDATA/EmojiSources.txt", function (res) {
-      var chunks = [];
-      // if all good ...
-      if (res.statusCode === 200) {
-        // grab all data
-        res.on('data', chunks.push.bind(chunks));
-        // once done ...
-        res.on('end', function () {
-          console.log('analyzing EmojiSources VS our assets ... ');
-          // store all missing assets in one object
-          var missing = {};
-          // will be used to store an array with all missing
-          var missingGrouped = {};
-
-          // will be needed later on
-          // parse it, clean it, and store it once
-          q.emojiSource = chunks
-            .join('')
-            .split(/\r\n|\r|\n/)
-            // filter once
-            .filter(function (line) {
-              return this.test(line);
-            }, /^[0-9A-F]/)
-            // take only emoji info
-            .map(function (codePoint) {
-              return codePoint
-                .slice(0, codePoint.indexOf(';'))
-                .toUpperCase()
-                // drop spaces
-                .replace(/\s+/g, '-')
-                // drop 0 padded prefixes
-                .replace(/^0+/g, '');
-            });
-
-          console.log('[INFO] parsed ' + q.emojiSource.length + ' standard emoji.');
-
-          // find out which one is missing from our assets
-          q.emojiSource.forEach(
-            function (emoji) {
-              // do not loop for emoji we know we should ignore
-              if (ignoreMissing.indexOf(emoji) < 0) {
-                // verify all others per each folder
-                this.forEach(function (path) {
-                  if (assets[path].indexOf(emoji) < 0) {
-                    (missing[path] || (missing[path] = [])).push(emoji);
-                    missingGrouped[emoji] = true;
-                  }
-                });
-              }
-            },
-            // and per each folder
-            Object.keys(assets)
-          );
-
-          // if some missing emoji has been found
-          if (Object.keys(missing).length) {
-            // warn and show which one is missing
-            console.warn('[WARNING] missing assets for:');
-            console.log(missing);
-          }
-          // create the array of all emoji we should ignore
-          q.ignore = ignoreMissing.concat(Object.keys(missingGrouped));
-
-          q.next();
-        });
-      } else {
-        console.error('[ERROR] unable to fetch emoji at unicode.org');
-        process.exit(1);
-      }
-    });
-  },
-
-  // grab the list of emoji that behave differently when
-  // variants such \uFE0E and \uFE0F are in place
-  function grabStandardVariants(q) {
-    console.log('fetching StandardizedVariants.txt ... ');
-    http.get(
-      "http://unicode.org/Public/UNIDATA/StandardizedVariants.txt",
-      function(res) {
-        var chunks = [];
-        if (res.statusCode == 200) {
-          res.on('data', chunks.push.bind(chunks));
-          res.on('end', function () {
-            // cleaning up parsing sensitive emoji
-            q.variantsSensitive = chunks
-              .join('')                         // all content
-              .split(/\r\n|\r|\n/)              // split in lines
-              .filter(function (line) {         // containing FE0E; info
-                return this.test(line);         // avoiding duplicated with FE0F
-              }, / FE0E; text style/)
-              .map(function (line) {            // cleaned up to grab
-                return line.replace(this, '$1') // only first unicode
-                        .toUpperCase();         // normalized as uppercase
-              }, /^([0-9A-F]{4,}) FE0E;.+$/)    // sensitive char
-            ;
-
-            // iOS keyboard allows U+002A U+FE0F U+20E3 even though not a standardized variant (yet?)
-            q.variantsSensitive.push('002A');
-            // iOS keyboard allows U+2639 U+FE0F even though not a standardized variant (yet?)
-            q.variantsSensitive.push('2639');
-
-            console.log('[INFO] parsed ' + q.variantsSensitive.length + ' variant sensitive emoji.');
-            q.next();
-
-          });
-        } else {
-          console.error('[ERROR] unable to fetch standard variants at unicode.org');
-          process.exit(1);
-        }
-      }
-    );
-  },
-
-  // add our own assets that are not part of the Unicode standard
-  function addMissingEmoji(q) {
-    q.nonStandard = [];
-    Object.keys(assets).forEach(function (path, i) {
-      assets[path].forEach(function (emoji) {
-        if (
-          q.emojiSource.indexOf(emoji) < 0 &&
-          q.nonStandard.indexOf(emoji) < 0
-        ) {
-          q.nonStandard.push(emoji);
-        }
-      });
-    });
-
-    if (q.nonStandard.length) {
-      console.warn('[WARNING] assets contain ' + q.nonStandard.length + ' non standard emoji:');
-      // console.log(q.nonStandard.join(', '));
-    }
-
-    q.emojiSource = q.emojiSource.concat(q.nonStandard)
-    q.next();
-  },
-
-  // detect complete sets of five skin tones and a base
-  function detectDiversityEmoji(q) {
-    var isPresent = {};
-    q.emojiSource.forEach(function (codePoints) {
-      isPresent[codePoints] = true;
-    });
-    q.diversityBase = q.emojiSource.filter(function (codePoints) {
-      // Start with the set of Emoji with the light skin tone
-      return /-1F3FB$/.test(codePoints);
-    }).map(function (codePoints) {
-      // Take the skin tone off
-      return codePoints.replace(/-1F3FB$/, '');
-    }).filter(function (baseCodePoints) {
-      // Verify that all other skin tones + no skin tone are present
-      return ['-1F3FC', '-1F3FD', '-1F3FE', '-1F3FF', ''].every(function (suffix) {
-        return isPresent[baseCodePoints + suffix];
-      });
-    });
-    console.log('[INFO] parsed ' + q.diversityBase.length + ' diversity emoji.');
-    q.next();
-  },
-
-  // detect complete sets of five skin tones and a base
-  function partitionEmojiTypes(q) {
-    console.log('partitioning emoji into types');
-    q.zwj = [];
-    q.diversity = [];
-    q.sensitive = [];
-    q.sensitiveKeycaps = [];
-    q.diversitySensitive = [];
-    q.regular = [];
-    q.emojiSource.forEach(function (codePoints) {
-      var u;
-      var codePointsWithoutKeycap;
-      codePoints = codePoints.replace(/\b[A-F0-9]+\b/g, function (hex) {
-        // Pad all hex numbers to have at least 4 digits to match variantsSensitive
-        return hex.length < 4 ? ('000' + hex).slice(-4) : hex;
-      });
-      if (q.ignore.indexOf(codePoints) < 0) {
-        u = Utils.toJSON(codePoints);
-        codePointsWithoutKeycap = codePoints.replace(/-20E3$/, '');
-        if (codePoints.indexOf('200D') >= 0) {
-          q.zwj.push(u);
-        } else if (codePoints != codePointsWithoutKeycap && q.variantsSensitive.indexOf(codePointsWithoutKeycap) >= 0) {
-          q.sensitiveKeycaps.push(Utils.toJSON(codePointsWithoutKeycap));
-        } else if (q.diversityBase.indexOf(codePoints.replace(/-1F3F[B-F]$/, '')) >= 0) {
-          // This is a diversity Emoji with or without a skin tone modifier
-          // Add it to the regex if this is the base without the modifier
-          if (q.diversityBase.indexOf(codePoints) >= 0) {
-            if (q.variantsSensitive.indexOf(codePoints) < 0) {
-              q.diversity.push(u);
-            } else {
-              q.diversitySensitive.push(u);
-            }
-          }
-        } else if (q.variantsSensitive.indexOf(codePoints) < 0) {
-          q.regular.push(u);
-        } else {
-          q.sensitive.push(u);
-        }
-      }
-    });
-    q.next();
-  },
-
-  function factorZwjSequences(q) {
-    q.zwjCommonPatterns = [];
-
-    // There are dozens of new ZWJ sequences that have common prefixes or suffixes with
-    // skin tone + gender variations. To keep the main regex from growing excessively large and
-    // slow, choose some common sub-expressions to factor.
-    var commonPatterns = [
-      {
-        name: 'leading man/woman zwj with optional skin tone',
-        re: '\\ud83d[\\udc68-\\udc69](?:\\ud83c[\\udffb-\\udfff])?\\u200d(.+?)',
-        numCombinations: 12
-      }, {
-        name: 'variant or skin tone before trailing female/male zwj',
-        re: '(.+?)(?:\\ufe0f|\\ud83c[\\udffb-\\udfff])\\u200d[\\u2640\\u2642]\\ufe0f',
-        numCombinations: 12
-      }, {
-        name: 'optional skin tone before trailing female/male zwj',
-        re: '(.+?)(?:\\ud83c[\\udffb-\\udfff])?\\u200d[\\u2640\\u2642]\\ufe0f',
-        numCombinations: 12
-      }
-    ];
-
-    commonPatterns.forEach(function(pattern) {
-      var mapOfMatches = {};
-      var re = new RegExp('^' + pattern.re + '$');
-      q.zwj.forEach(function(jsonString) {
-        var rawString = JSON.parse('"' + jsonString + '"');
-        var match = rawString.match(re);
-        if (match) {
-          var key = match[1];
-          mapOfMatches[key] = mapOfMatches[key] || [];
-          mapOfMatches[key].push(match[0]);
-        }
-      });
-      var replacements = [];
-      Object.keys(mapOfMatches).forEach(function(key) {
-        var matches = mapOfMatches[key];
-        // Only a complete set may be replaced
-        if (matches.length === pattern.numCombinations) {
-          replacements.push(Utils.UTF162JSON(key));
-          // Remove all items in the match set from the original zwj list
-          matches.forEach(function(rawString) {
-            var indexToRemove = q.zwj.indexOf(Utils.UTF162JSON(rawString));
-            if (indexToRemove >= 0) {
-              q.zwj.splice(indexToRemove, 1);
-            }
-          });
-        }
-      });
-      if (replacements.length) {
-        // Replace the wildcard section of the regex with a regex group of replacements
-        var re = pattern.re.replace('(.+?', '(?:' + generateRegexPartial(replacements));
-        q.zwjCommonPatterns.push(re);
-        console.log('Refactoring ' + replacements.length + ' complete sets of ' + pattern.numCombinations + ' zwj from ' + pattern.name);
-      } else {
-        console.log('did not find any complete sets of ' + pattern.name);
-      }
-    });
-
-    q.next();
-  },
-
-  // with all info, generate a RegExp that will catch
-  // only standard emoji that are present in our assets
-  function generateRegExp(q) {
-    console.log('generating a RegExp for available assets');
-    q.re = '';
-
-    // The Zero-width joiner common patterns, if present, need to come first
-    if (q.zwjCommonPatterns.length) {
-      q.re += q.zwjCommonPatterns.join('|') + '|';
-    }
-
-    // Then the rest of the zwjs
-    if (q.zwj.length) {
-      q.re += generateRegexPartial(q.zwj) + '|';
-    }
-
-    // Group the variant sensitive keycaps
-    if (q.sensitiveKeycaps.length) {
-      q.re += '(?:' + generateRegexPartial(q.sensitiveKeycaps) + ')\\ufe0f?\\u20e3|';
-    }
-
-    // Next, add the diversity enabled Emoji that may include a skin tone suffix
-    if (q.diversity.length + q.diversitySensitive.length) {
-      q.re += '(?:';
-      if (q.diversitySensitive.length) {
-        // Some diversity are sensitive to variants
-        q.re += '(?:' + generateRegexPartial(q.diversitySensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
-        if (q.diversity.length) {
-          q.re += '|';
-        }
-      }
-      q.re += generateRegexPartial(q.diversity) + ')(?:' + generateRegexPartial(skinToneOptions) + '|)|';
-    }
-
-    // Next, the normal Emoji
-    q.re += generateRegexPartial(q.regular) + '|';
-
-    // Finally, add the rest of the sensitive ones that may be followed by U+FE0F but not U+FE0E
-    q.re += '(?:' + generateRegexPartial(q.sensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
-    q.next();
-  },
-
-  function generateFile(q) {
-    console.log('generating ./twemoji.js');
-    createTwemoji(q.re);
-    require('./create-dist');
-  }
-
-]);
-
-
-
-function createTwemoji(re) {
+function createTwemoji() {
  fs.writeFileSync(
    file('2/twemoji.js'),
    '/*jslint indent: 2, browser: true, bitwise: true, plusplus: true */\n' +
@@ -1042,4 +599,8 @@ function createTwemoji(re) {
        ) +
        '\n  */'
      ) + '());');
-}
+
+}
+
+createTwemoji();
+require('./create-dist');