1
0
mirror of https://github.com/twitter/twemoji.git synced 2025-03-12 07:14:20 +00:00

Remove a large swath of unmaintained emoji regex generation code

This commit is contained in:
Nathan Downs 2019-04-03 17:53:39 -07:00
parent 0e43130c2d
commit 6f4fd350d7

View File

@ -15,450 +15,7 @@ function file(which) {
return path.join(__dirname, '../..', which);
}
// Asset names found on disk, keyed by the folder they are read from.
// The arrays start empty and are filled by the grabAllAssets task.
var assets = { '2/72x72': [], '2/svg': [] };

// The five skin tone modifiers (U+1F3FB .. U+1F3FF) written as escaped
// UTF-16 surrogate pairs; they only differ in the last hex digit.
var skinToneOptions = ['b', 'c', 'd', 'e', 'f'].map(function (lastDigit) {
  return '\\ud83c\\udff' + lastDigit;
});

// White space code points that must never be caught by the RegExp:
// there is no asset equivalent for these.
var ignoreMissing = [
  '2002',
  '2003',
  '2005'
];
// Turns a list of escaped UTF-16 sequences into one regex alternation.
// `items` holds strings written with literal \u escapes, such as
// ["\u2963\ufe0f", "\u263a\ufe0f"].
// The sequences are ordered by length (long to short) and then by the hex
// value of each unit (low to high), and the output joins them with "|".
// Neighbouring sequences that share everything but their last unit are
// merged into a character class, and runs of three or more consecutive last
// units inside that class are collapsed into a range.
// Example: "aab", "aac", "aad", "aag", "ba" becomes "aa[b-dg]|ba"
function generateRegexPartial(items) {
  var alternatives = [];   // finished pieces, joined with "|" at the end
  var activePrefix = null; // escaped units shared by the class being built
  var classMembers = [];   // members of the character class being built
  var pendingRun = [];     // consecutive last units not yet added to the class

  // a and b are arrays of hex UCS-2 units
  function compareUnits(a, b) {
    var i;
    var delta;
    if (!a.length) {
      return 0;
    }
    if (b.length !== a.length) {
      return b.length - a.length;
    }
    for (i = 0; i < a.length; i++) {
      delta = parseInt(a[i], 16) - parseInt(b[i], 16);
      if (delta) {
        return delta;
      }
    }
    return 0;
  }

  // Moves the pending run into the class, as "first-last" when it holds
  // at least three units and unit by unit otherwise.
  function closeRun() {
    if (pendingRun.length < 3) {
      classMembers = classMembers.concat(pendingRun);
    } else {
      classMembers.push(pendingRun[0], '-', pendingRun[pendingRun.length - 1]);
    }
    pendingRun = [];
  }

  // Emits the class collected for the active prefix (if any) and resets it.
  function closeClass() {
    closeRun();
    if (classMembers.length === 1) {
      alternatives.push(activePrefix + classMembers[0]);
    } else if (classMembers.length) {
      alternatives.push(activePrefix + '[' + classMembers.join('') + ']');
    }
    classMembers = [];
    activePrefix = null;
  }

  items.map(function (item) {
    // Convert from "\u2963\ufe0f" into ["2963", "fe0f"]
    return item.split('\\u').slice(1);
  }).sort(compareUnits).forEach(function (units) {
    var lead = units.slice(0, -1).join('\\u');
    var prefix = lead ? '\\u' + lead : lead;
    var tail = units.slice(-1).join('');
    var predecessor;

    // a different prefix starts a new alternative
    if (prefix !== activePrefix) {
      closeClass();
    }
    activePrefix = prefix;

    // the run only continues when this unit directly follows the previous one
    predecessor = Utils.UTF162JSON(String.fromCharCode(parseInt(tail, 16) - 1));
    if (pendingRun.length && pendingRun[pendingRun.length - 1] !== predecessor) {
      closeRun();
    }
    pendingRun.push('\\u' + tail);
  });

  closeClass();
  return alternatives.join('|');
}
// Minimal helper to run a list of asynchronous steps one after another.
// see: http://webreflection.blogspot.co.uk/2012/03/tweet-sized-queue-system.html
// or: http://webreflection.blogspot.co.uk/2012/06/working-with-queues.html
//
// `args` is an array of step functions. It gains a `next()` method that
// removes the first remaining step and calls it with the array itself, so a
// step can store results on it and call `next()` when done. `next()` returns
// true when a step was run and false when none was left.
// The first `next()` call is scheduled with a zero-delay timer, and the same
// array is returned to the caller.
function Queue(args, f) {
  args.next = function next() {
    f = args.shift();
    if (!f) {
      return false;
    }
    f(args);
    return true;
  };
  setTimeout(args.next, 0);
  return args;
}
// main task
Queue([
// will populate assets arrays
function grabAllAssets(q) {
console.log('analyzing all assets ... ');
// per each path/folder
Object.keys(assets).forEach(function (path, i, paths) {
// grab all files in that folder
fs.readdir(file(path), function (err, files) {
// and add them to the assets path
assets[path].push.apply(
assets[path],
files.map(upperCaseWithoutExtension)
);
// once all assets arrays have been populated
if (paths.reduce(completed, true)) {
console.log('[INFO] assets contains ' + assets[path].length + ' emoji.');
q.next();
}
});
});
// drop extension + uppercase
function upperCaseWithoutExtension(file) {
return file.slice(0, file.lastIndexOf('.')).toUpperCase();
}
// returns true if all assets have been populated
function completed(p, c) {
return p && assets[c].length;
}
},
// will fetch and store all emoji from unicode.org
function fetchEmojiSources(q) {
console.log('fetching EmojiSources.txt ... ');
// grab all emoji and test them against them
http.get("http://www.unicode.org/Public/UNIDATA/EmojiSources.txt", function (res) {
var chunks = [];
// if all good ...
if (res.statusCode === 200) {
// grab all data
res.on('data', chunks.push.bind(chunks));
// once done ...
res.on('end', function () {
console.log('analyzing EmojiSources VS our assets ... ');
// store all missing assets in one object
var missing = {};
// will be used to store an array with all missing
var missingGrouped = {};
// will be needed later on
// parse it, clean it, and store it once
q.emojiSource = chunks
.join('')
.split(/\r\n|\r|\n/)
// filter once
.filter(function (line) {
return this.test(line);
}, /^[0-9A-F]/)
// take only emoji info
.map(function (codePoint) {
return codePoint
.slice(0, codePoint.indexOf(';'))
.toUpperCase()
// drop spaces
.replace(/\s+/g, '-')
// drop 0 padded prefixes
.replace(/^0+/g, '');
});
console.log('[INFO] parsed ' + q.emojiSource.length + ' standard emoji.');
// find out which one is missing from our assets
q.emojiSource.forEach(
function (emoji) {
// do not loop for emoji we know we should ignore
if (ignoreMissing.indexOf(emoji) < 0) {
// verify all others per each folder
this.forEach(function (path) {
if (assets[path].indexOf(emoji) < 0) {
(missing[path] || (missing[path] = [])).push(emoji);
missingGrouped[emoji] = true;
}
});
}
},
// and per each folder
Object.keys(assets)
);
// if some missing emoji has been found
if (Object.keys(missing).length) {
// warn and show which one is missing
console.warn('[WARNING] missing assets for:');
console.log(missing);
}
// create the array of all emoji we should ignore
q.ignore = ignoreMissing.concat(Object.keys(missingGrouped));
q.next();
});
} else {
console.error('[ERROR] unable to fetch emoji at unicode.org');
process.exit(1);
}
});
},
// grab the list of emoji that behave differently when
// variants such \uFE0E and \uFE0F are in place
function grabStandardVariants(q) {
console.log('fetching StandardizedVariants.txt ... ');
http.get(
"http://unicode.org/Public/UNIDATA/StandardizedVariants.txt",
function(res) {
var chunks = [];
if (res.statusCode == 200) {
res.on('data', chunks.push.bind(chunks));
res.on('end', function () {
// cleaning up parsing sensitive emoji
q.variantsSensitive = chunks
.join('') // all content
.split(/\r\n|\r|\n/) // split in lines
.filter(function (line) { // containing FE0E; info
return this.test(line); // avoiding duplicated with FE0F
}, / FE0E; text style/)
.map(function (line) { // cleaned up to grab
return line.replace(this, '$1') // only first unicode
.toUpperCase(); // normalized as uppercase
}, /^([0-9A-F]{4,}) FE0E;.+$/) // sensitive char
;
// iOS keyboard allows U+002A U+FE0F U+20E3 even though not a standardized variant (yet?)
q.variantsSensitive.push('002A');
// iOS keyboard allows U+2639 U+FE0F even though not a standardized variant (yet?)
q.variantsSensitive.push('2639');
console.log('[INFO] parsed ' + q.variantsSensitive.length + ' variant sensitive emoji.');
q.next();
});
} else {
console.error('[ERROR] unable to fetch standard variants at unicode.org');
process.exit(1);
}
}
);
},
// add our own assets that are not part of the Unicode standard
function addMissingEmoji(q) {
q.nonStandard = [];
Object.keys(assets).forEach(function (path, i) {
assets[path].forEach(function (emoji) {
if (
q.emojiSource.indexOf(emoji) < 0 &&
q.nonStandard.indexOf(emoji) < 0
) {
q.nonStandard.push(emoji);
}
});
});
if (q.nonStandard.length) {
console.warn('[WARNING] assets contain ' + q.nonStandard.length + ' non standard emoji:');
// console.log(q.nonStandard.join(', '));
}
q.emojiSource = q.emojiSource.concat(q.nonStandard)
q.next();
},
// detect complete sets of five skin tones and a base
function detectDiversityEmoji(q) {
var isPresent = {};
q.emojiSource.forEach(function (codePoints) {
isPresent[codePoints] = true;
});
q.diversityBase = q.emojiSource.filter(function (codePoints) {
// Start with the set of Emoji with the light skin tone
return /-1F3FB$/.test(codePoints);
}).map(function (codePoints) {
// Take the skin tone off
return codePoints.replace(/-1F3FB$/, '');
}).filter(function (baseCodePoints) {
// Verify that all other skin tones + no skin tone are present
return ['-1F3FC', '-1F3FD', '-1F3FE', '-1F3FF', ''].every(function (suffix) {
return isPresent[baseCodePoints + suffix];
});
});
console.log('[INFO] parsed ' + q.diversityBase.length + ' diversity emoji.');
q.next();
},
// detect complete sets of five skin tones and a base
function partitionEmojiTypes(q) {
console.log('partitioning emoji into types');
q.zwj = [];
q.diversity = [];
q.sensitive = [];
q.sensitiveKeycaps = [];
q.diversitySensitive = [];
q.regular = [];
q.emojiSource.forEach(function (codePoints) {
var u;
var codePointsWithoutKeycap;
codePoints = codePoints.replace(/\b[A-F0-9]+\b/g, function (hex) {
// Pad all hex numbers to have at least 4 digits to match variantsSensitive
return hex.length < 4 ? ('000' + hex).slice(-4) : hex;
});
if (q.ignore.indexOf(codePoints) < 0) {
u = Utils.toJSON(codePoints);
codePointsWithoutKeycap = codePoints.replace(/-20E3$/, '');
if (codePoints.indexOf('200D') >= 0) {
q.zwj.push(u);
} else if (codePoints != codePointsWithoutKeycap && q.variantsSensitive.indexOf(codePointsWithoutKeycap) >= 0) {
q.sensitiveKeycaps.push(Utils.toJSON(codePointsWithoutKeycap));
} else if (q.diversityBase.indexOf(codePoints.replace(/-1F3F[B-F]$/, '')) >= 0) {
// This is a diversity Emoji with or without a skin tone modifier
// Add it to the regex if this is the base without the modifier
if (q.diversityBase.indexOf(codePoints) >= 0) {
if (q.variantsSensitive.indexOf(codePoints) < 0) {
q.diversity.push(u);
} else {
q.diversitySensitive.push(u);
}
}
} else if (q.variantsSensitive.indexOf(codePoints) < 0) {
q.regular.push(u);
} else {
q.sensitive.push(u);
}
}
});
q.next();
},
function factorZwjSequences(q) {
q.zwjCommonPatterns = [];
// There are dozens of new ZWJ sequences that have common prefixes or suffixes with
// skin tone + gender variations. To keep the main regex from growing excessively large and
// slow, choose some common sub-expressions to factor.
var commonPatterns = [
{
name: 'leading man/woman zwj with optional skin tone',
re: '\\ud83d[\\udc68-\\udc69](?:\\ud83c[\\udffb-\\udfff])?\\u200d(.+?)',
numCombinations: 12
}, {
name: 'variant or skin tone before trailing female/male zwj',
re: '(.+?)(?:\\ufe0f|\\ud83c[\\udffb-\\udfff])\\u200d[\\u2640\\u2642]\\ufe0f',
numCombinations: 12
}, {
name: 'optional skin tone before trailing female/male zwj',
re: '(.+?)(?:\\ud83c[\\udffb-\\udfff])?\\u200d[\\u2640\\u2642]\\ufe0f',
numCombinations: 12
}
];
commonPatterns.forEach(function(pattern) {
var mapOfMatches = {};
var re = new RegExp('^' + pattern.re + '$');
q.zwj.forEach(function(jsonString) {
var rawString = JSON.parse('"' + jsonString + '"');
var match = rawString.match(re);
if (match) {
var key = match[1];
mapOfMatches[key] = mapOfMatches[key] || [];
mapOfMatches[key].push(match[0]);
}
});
var replacements = [];
Object.keys(mapOfMatches).forEach(function(key) {
var matches = mapOfMatches[key];
// Only a complete set may be replaced
if (matches.length === pattern.numCombinations) {
replacements.push(Utils.UTF162JSON(key));
// Remove all items in the match set from the original zwj list
matches.forEach(function(rawString) {
var indexToRemove = q.zwj.indexOf(Utils.UTF162JSON(rawString));
if (indexToRemove >= 0) {
q.zwj.splice(indexToRemove, 1);
}
});
}
});
if (replacements.length) {
// Replace the wildcard section of the regex with a regex group of replacements
var re = pattern.re.replace('(.+?', '(?:' + generateRegexPartial(replacements));
q.zwjCommonPatterns.push(re);
console.log('Refactoring ' + replacements.length + ' complete sets of ' + pattern.numCombinations + ' zwj from ' + pattern.name);
} else {
console.log('did not find any complete sets of ' + pattern.name);
}
});
q.next();
},
// with all info, generate a RegExp that will catch
// only standard emoji that are present in our assets
function generateRegExp(q) {
console.log('generating a RegExp for available assets');
q.re = '';
// The Zero-width joiner common patterns, if present, need to come first
if (q.zwjCommonPatterns.length) {
q.re += q.zwjCommonPatterns.join('|') + '|';
}
// Then the rest of the zwjs
if (q.zwj.length) {
q.re += generateRegexPartial(q.zwj) + '|';
}
// Group the variant sensitive keycaps
if (q.sensitiveKeycaps.length) {
q.re += '(?:' + generateRegexPartial(q.sensitiveKeycaps) + ')\\ufe0f?\\u20e3|';
}
// Next, add the diversity enabled Emoji that may include a skin tone suffix
if (q.diversity.length + q.diversitySensitive.length) {
q.re += '(?:';
if (q.diversitySensitive.length) {
// Some diversity are sensitive to variants
q.re += '(?:' + generateRegexPartial(q.diversitySensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
if (q.diversity.length) {
q.re += '|';
}
}
q.re += generateRegexPartial(q.diversity) + ')(?:' + generateRegexPartial(skinToneOptions) + '|)|';
}
// Next, the normal Emoji
q.re += generateRegexPartial(q.regular) + '|';
// Finally, add the rest of the sensitive ones that may be followed by U+FE0F but not U+FE0E
q.re += '(?:' + generateRegexPartial(q.sensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
q.next();
},
// Last step of the queue: hands the generated RegExp source to
// createTwemoji and then loads ./create-dist.
function generateFile(q) {
  var regexSource = q.re;
  console.log('generating ./twemoji.js');
  createTwemoji(regexSource);
  require('./create-dist');
}
]);
function createTwemoji(re) {
function createTwemoji() {
fs.writeFileSync(
file('2/twemoji.js'),
'/*jslint indent: 2, browser: true, bitwise: true, plusplus: true */\n' +
@ -1042,4 +599,8 @@ function createTwemoji(re) {
) +
'\n */'
) + '());');
}
}
createTwemoji();
require('./create-dist');