mirror of
https://github.com/twitter/twemoji.git
synced 2025-03-12 07:14:20 +00:00
Remove a large swath of unmaintained emoji regex generation code
This commit is contained in:
parent
0e43130c2d
commit
6f4fd350d7
@ -15,450 +15,7 @@ function file(which) {
|
||||
return path.join(__dirname, '../..', which);
|
||||
}
|
||||
|
||||
// Twitter assets by property name: one list of asset names per folder.
// Both lists start empty and are filled in by the grabAllAssets task.
var assets = {
  '2/72x72': [],
  '2/svg': []
};

// The five skin tone modifiers U+1F3FB..U+1F3FF as escaped surrogate pairs.
// They share the high surrogate and all but the last hex digit of the low one.
var skinToneOptions = ['b', 'c', 'd', 'e', 'f'].map(function (lastDigit) {
  return '\\ud83c\\udff' + lastDigit;
});

// white spaces we don't want to catch via the RegExp
// there is no asset equivalent for these
var ignoreMissing = [
  '2002',
  '2003',
  '2005'
];
|
||||
|
||||
// Builds one regex alternation out of a list of \u-escaped UTF-16 sequences.
//
// items:   array of unicode sequences with \u escaping (literal backslashes),
//          like ["\u2963\ufe0f", "\u263a\ufe0f"]
// returns: the alternatives "or" ed together using | for regex, or an empty
//          string when there is nothing to output
//
// Items get sorted by length (long to short), then unicode hex values (low to high).
// Output also combines adjacent items using character classes when they have a
// common prefix; three or more consecutive code units collapse into a range.
// Example: "aab", "aac", "aad", "aag", "ba" becomes "aa[b-dg]|ba"
//
// Items that contain no \u unit at all (e.g. an empty string) are skipped:
// they have no suffix to emit and would otherwise produce a dangling "\u".
function generateRegexPartial(items) {
  var currentPrefix = null;
  var result = [];
  var charClass = [];
  var charRange = [];

  items
    .map(function (item) {
      // Convert from "\u2963\ufe0f" into ["2963", "fe0f"]
      return item.split('\\u').slice(1);
    })
    .filter(function (units) {
      return units.length > 0;
    })
    .sort(compareUnits)
    .forEach(function (units) {
      var suffix = units[units.length - 1];
      var prefix = units.slice(0, -1).join('\\u');
      if (prefix) {
        prefix = '\\u' + prefix;
      }
      // a new prefix closes the character class collected so far
      if (prefix !== currentPrefix) {
        flushCharClass();
      }
      currentPrefix = prefix;

      // The escaped form of the code unit right before this one. The range only
      // keeps growing while each suffix directly follows the previous one.
      // NOTE(review): this compares against Utils.UTF162JSON output, so the
      // items presumably must use the same hex casing as that helper - confirm.
      var previousUnit = Utils.UTF162JSON(String.fromCharCode(parseInt(suffix, 16) - 1));
      if (charRange.length && charRange[charRange.length - 1] !== previousUnit) {
        flushCharRange();
      }
      charRange.push('\\u' + suffix);
    });

  flushCharClass();
  return result.join('|');

  // a and b are non-empty arrays of hex UCS-2 units;
  // longer sequences first, then unit by unit from low to high
  function compareUnits(a, b) {
    if (a.length !== b.length) {
      return b.length - a.length;
    }
    for (var i = 0; i < a.length; i++) {
      var difference = parseInt(a[i], 16) - parseInt(b[i], 16);
      if (difference) {
        return difference;
      }
    }
    return 0;
  }

  // moves the pending run into the character class, as "first-last" when it
  // holds three or more units and as the plain units otherwise
  function flushCharRange() {
    charClass = charClass.concat(charRange.length < 3 ?
      charRange :
      [charRange[0], '-', charRange[charRange.length - 1]]
    );
    charRange = [];
  }

  // emits the current prefix followed by the collected class; a class holding
  // a single entry is written without the surrounding brackets
  function flushCharClass() {
    flushCharRange();
    if (charClass.length) {
      result.push(currentPrefix + (charClass.length === 1 ?
        charClass[0] :
        '[' + charClass.join('') + ']'
      ));
    }
    charClass = [];
    currentPrefix = null;
  }
}
|
||||
|
||||
// basic utility to organize async code
// see: http://webreflection.blogspot.co.uk/2012/03/tweet-sized-queue-system.html
// or: http://webreflection.blogspot.co.uk/2012/06/working-with-queues.html
//
// args: array of task functions; each task receives the array itself and
//       calls its next() method once done
// f:    scratch slot holding the task being executed, callers leave it out
// returns the same array, decorated with next()
//
// The first task is started asynchronously through a zero delay timer.
function Queue(args, f) {
  // Runs the next task, if any. Returns true when a task has been taken off
  // the queue and invoked, false once the queue is empty.
  function next() {
    f = args.shift();
    if (!f) {
      return false;
    }
    f(args);
    return true;
  }

  args.next = next;
  setTimeout(next, 0);
  return args;
}
|
||||
|
||||
// main task
|
||||
Queue([
|
||||
|
||||
// will populate assets arrays
|
||||
function grabAllAssets(q) {
|
||||
console.log('analyzing all assets ... ');
|
||||
// per each path/folder
|
||||
Object.keys(assets).forEach(function (path, i, paths) {
|
||||
// grab all files in that folder
|
||||
fs.readdir(file(path), function (err, files) {
|
||||
// and add them to the assets path
|
||||
assets[path].push.apply(
|
||||
assets[path],
|
||||
files.map(upperCaseWithoutExtension)
|
||||
);
|
||||
// once all assets arrays have been populated
|
||||
if (paths.reduce(completed, true)) {
|
||||
console.log('[INFO] assets contains ' + assets[path].length + ' emoji.');
|
||||
q.next();
|
||||
}
|
||||
});
|
||||
});
|
||||
// drop extension + uppercase
|
||||
function upperCaseWithoutExtension(file) {
|
||||
return file.slice(0, file.lastIndexOf('.')).toUpperCase();
|
||||
}
|
||||
// returns true if all assets have been populated
|
||||
function completed(p, c) {
|
||||
return p && assets[c].length;
|
||||
}
|
||||
},
|
||||
|
||||
// will fetch and store all emoji from unicode.org
|
||||
function fetchEmojiSources(q) {
|
||||
console.log('fetching EmojiSources.txt ... ');
|
||||
// grab all emoji and test them against them
|
||||
http.get("http://www.unicode.org/Public/UNIDATA/EmojiSources.txt", function (res) {
|
||||
var chunks = [];
|
||||
// if all good ...
|
||||
if (res.statusCode === 200) {
|
||||
// grab all data
|
||||
res.on('data', chunks.push.bind(chunks));
|
||||
// once done ...
|
||||
res.on('end', function () {
|
||||
console.log('analyzing EmojiSources VS our assets ... ');
|
||||
// store all missing assets in one object
|
||||
var missing = {};
|
||||
// will be used to store an array with all missing
|
||||
var missingGrouped = {};
|
||||
|
||||
// will be needed later on
|
||||
// parse it, clean it, and store it once
|
||||
q.emojiSource = chunks
|
||||
.join('')
|
||||
.split(/\r\n|\r|\n/)
|
||||
// filter once
|
||||
.filter(function (line) {
|
||||
return this.test(line);
|
||||
}, /^[0-9A-F]/)
|
||||
// take only emoji info
|
||||
.map(function (codePoint) {
|
||||
return codePoint
|
||||
.slice(0, codePoint.indexOf(';'))
|
||||
.toUpperCase()
|
||||
// drop spaces
|
||||
.replace(/\s+/g, '-')
|
||||
// drop 0 padded prefixes
|
||||
.replace(/^0+/g, '');
|
||||
});
|
||||
|
||||
console.log('[INFO] parsed ' + q.emojiSource.length + ' standard emoji.');
|
||||
|
||||
// find out which one is missing from our assets
|
||||
q.emojiSource.forEach(
|
||||
function (emoji) {
|
||||
// do not loop for emoji we know we should ignore
|
||||
if (ignoreMissing.indexOf(emoji) < 0) {
|
||||
// verify all others per each folder
|
||||
this.forEach(function (path) {
|
||||
if (assets[path].indexOf(emoji) < 0) {
|
||||
(missing[path] || (missing[path] = [])).push(emoji);
|
||||
missingGrouped[emoji] = true;
|
||||
}
|
||||
});
|
||||
}
|
||||
},
|
||||
// and per each folder
|
||||
Object.keys(assets)
|
||||
);
|
||||
|
||||
// if some missing emoji has been found
|
||||
if (Object.keys(missing).length) {
|
||||
// warn and show which one is missing
|
||||
console.warn('[WARNING] missing assets for:');
|
||||
console.log(missing);
|
||||
}
|
||||
// create the array of all emoji we should ignore
|
||||
q.ignore = ignoreMissing.concat(Object.keys(missingGrouped));
|
||||
|
||||
q.next();
|
||||
});
|
||||
} else {
|
||||
console.error('[ERROR] unable to fetch emoji at unicode.org');
|
||||
process.exit(1);
|
||||
}
|
||||
});
|
||||
},
|
||||
|
||||
// grab the list of emoji that behave differently when
|
||||
// variants such \uFE0E and \uFE0F are in place
|
||||
function grabStandardVariants(q) {
|
||||
console.log('fetching StandardizedVariants.txt ... ');
|
||||
http.get(
|
||||
"http://unicode.org/Public/UNIDATA/StandardizedVariants.txt",
|
||||
function(res) {
|
||||
var chunks = [];
|
||||
if (res.statusCode == 200) {
|
||||
res.on('data', chunks.push.bind(chunks));
|
||||
res.on('end', function () {
|
||||
// cleaning up parsing sensitive emoji
|
||||
q.variantsSensitive = chunks
|
||||
.join('') // all content
|
||||
.split(/\r\n|\r|\n/) // split in lines
|
||||
.filter(function (line) { // containing FE0E; info
|
||||
return this.test(line); // avoiding duplicated with FE0F
|
||||
}, / FE0E; text style/)
|
||||
.map(function (line) { // cleaned up to grab
|
||||
return line.replace(this, '$1') // only first unicode
|
||||
.toUpperCase(); // normalized as uppercase
|
||||
}, /^([0-9A-F]{4,}) FE0E;.+$/) // sensitive char
|
||||
;
|
||||
|
||||
// iOS keyboard allows U+002A U+FE0F U+20E3 even though not a standardized variant (yet?)
|
||||
q.variantsSensitive.push('002A');
|
||||
// iOS keyboard allows U+2639 U+FE0F even though not a standardized variant (yet?)
|
||||
q.variantsSensitive.push('2639');
|
||||
|
||||
console.log('[INFO] parsed ' + q.variantsSensitive.length + ' variant sensitive emoji.');
|
||||
q.next();
|
||||
|
||||
});
|
||||
} else {
|
||||
console.error('[ERROR] unable to fetch standard variants at unicode.org');
|
||||
process.exit(1);
|
||||
}
|
||||
}
|
||||
);
|
||||
},
|
||||
|
||||
// add our own assets that are not part of the Unicode standard
|
||||
function addMissingEmoji(q) {
|
||||
q.nonStandard = [];
|
||||
Object.keys(assets).forEach(function (path, i) {
|
||||
assets[path].forEach(function (emoji) {
|
||||
if (
|
||||
q.emojiSource.indexOf(emoji) < 0 &&
|
||||
q.nonStandard.indexOf(emoji) < 0
|
||||
) {
|
||||
q.nonStandard.push(emoji);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
if (q.nonStandard.length) {
|
||||
console.warn('[WARNING] assets contain ' + q.nonStandard.length + ' non standard emoji:');
|
||||
// console.log(q.nonStandard.join(', '));
|
||||
}
|
||||
|
||||
q.emojiSource = q.emojiSource.concat(q.nonStandard)
|
||||
q.next();
|
||||
},
|
||||
|
||||
// detect complete sets of five skin tones and a base
|
||||
function detectDiversityEmoji(q) {
|
||||
var isPresent = {};
|
||||
q.emojiSource.forEach(function (codePoints) {
|
||||
isPresent[codePoints] = true;
|
||||
});
|
||||
q.diversityBase = q.emojiSource.filter(function (codePoints) {
|
||||
// Start with the set of Emoji with the light skin tone
|
||||
return /-1F3FB$/.test(codePoints);
|
||||
}).map(function (codePoints) {
|
||||
// Take the skin tone off
|
||||
return codePoints.replace(/-1F3FB$/, '');
|
||||
}).filter(function (baseCodePoints) {
|
||||
// Verify that all other skin tones + no skin tone are present
|
||||
return ['-1F3FC', '-1F3FD', '-1F3FE', '-1F3FF', ''].every(function (suffix) {
|
||||
return isPresent[baseCodePoints + suffix];
|
||||
});
|
||||
});
|
||||
console.log('[INFO] parsed ' + q.diversityBase.length + ' diversity emoji.');
|
||||
q.next();
|
||||
},
|
||||
|
||||
// detect complete sets of five skin tones and a base
|
||||
function partitionEmojiTypes(q) {
|
||||
console.log('partitioning emoji into types');
|
||||
q.zwj = [];
|
||||
q.diversity = [];
|
||||
q.sensitive = [];
|
||||
q.sensitiveKeycaps = [];
|
||||
q.diversitySensitive = [];
|
||||
q.regular = [];
|
||||
q.emojiSource.forEach(function (codePoints) {
|
||||
var u;
|
||||
var codePointsWithoutKeycap;
|
||||
codePoints = codePoints.replace(/\b[A-F0-9]+\b/g, function (hex) {
|
||||
// Pad all hex numbers to have at least 4 digits to match variantsSensitive
|
||||
return hex.length < 4 ? ('000' + hex).slice(-4) : hex;
|
||||
});
|
||||
if (q.ignore.indexOf(codePoints) < 0) {
|
||||
u = Utils.toJSON(codePoints);
|
||||
codePointsWithoutKeycap = codePoints.replace(/-20E3$/, '');
|
||||
if (codePoints.indexOf('200D') >= 0) {
|
||||
q.zwj.push(u);
|
||||
} else if (codePoints != codePointsWithoutKeycap && q.variantsSensitive.indexOf(codePointsWithoutKeycap) >= 0) {
|
||||
q.sensitiveKeycaps.push(Utils.toJSON(codePointsWithoutKeycap));
|
||||
} else if (q.diversityBase.indexOf(codePoints.replace(/-1F3F[B-F]$/, '')) >= 0) {
|
||||
// This is a diversity Emoji with or without a skin tone modifier
|
||||
// Add it to the regex if this is the base without the modifier
|
||||
if (q.diversityBase.indexOf(codePoints) >= 0) {
|
||||
if (q.variantsSensitive.indexOf(codePoints) < 0) {
|
||||
q.diversity.push(u);
|
||||
} else {
|
||||
q.diversitySensitive.push(u);
|
||||
}
|
||||
}
|
||||
} else if (q.variantsSensitive.indexOf(codePoints) < 0) {
|
||||
q.regular.push(u);
|
||||
} else {
|
||||
q.sensitive.push(u);
|
||||
}
|
||||
}
|
||||
});
|
||||
q.next();
|
||||
},
|
||||
|
||||
function factorZwjSequences(q) {
|
||||
q.zwjCommonPatterns = [];
|
||||
|
||||
// There are dozens of new ZWJ sequences that have common prefixes or suffixes with
|
||||
// skin tone + gender variations. To keep the main regex from growing excessively large and
|
||||
// slow, choose some common sub-expressions to factor.
|
||||
var commonPatterns = [
|
||||
{
|
||||
name: 'leading man/woman zwj with optional skin tone',
|
||||
re: '\\ud83d[\\udc68-\\udc69](?:\\ud83c[\\udffb-\\udfff])?\\u200d(.+?)',
|
||||
numCombinations: 12
|
||||
}, {
|
||||
name: 'variant or skin tone before trailing female/male zwj',
|
||||
re: '(.+?)(?:\\ufe0f|\\ud83c[\\udffb-\\udfff])\\u200d[\\u2640\\u2642]\\ufe0f',
|
||||
numCombinations: 12
|
||||
}, {
|
||||
name: 'optional skin tone before trailing female/male zwj',
|
||||
re: '(.+?)(?:\\ud83c[\\udffb-\\udfff])?\\u200d[\\u2640\\u2642]\\ufe0f',
|
||||
numCombinations: 12
|
||||
}
|
||||
];
|
||||
|
||||
commonPatterns.forEach(function(pattern) {
|
||||
var mapOfMatches = {};
|
||||
var re = new RegExp('^' + pattern.re + '$');
|
||||
q.zwj.forEach(function(jsonString) {
|
||||
var rawString = JSON.parse('"' + jsonString + '"');
|
||||
var match = rawString.match(re);
|
||||
if (match) {
|
||||
var key = match[1];
|
||||
mapOfMatches[key] = mapOfMatches[key] || [];
|
||||
mapOfMatches[key].push(match[0]);
|
||||
}
|
||||
});
|
||||
var replacements = [];
|
||||
Object.keys(mapOfMatches).forEach(function(key) {
|
||||
var matches = mapOfMatches[key];
|
||||
// Only a complete set may be replaced
|
||||
if (matches.length === pattern.numCombinations) {
|
||||
replacements.push(Utils.UTF162JSON(key));
|
||||
// Remove all items in the match set from the original zwj list
|
||||
matches.forEach(function(rawString) {
|
||||
var indexToRemove = q.zwj.indexOf(Utils.UTF162JSON(rawString));
|
||||
if (indexToRemove >= 0) {
|
||||
q.zwj.splice(indexToRemove, 1);
|
||||
}
|
||||
});
|
||||
}
|
||||
});
|
||||
if (replacements.length) {
|
||||
// Replace the wildcard section of the regex with a regex group of replacements
|
||||
var re = pattern.re.replace('(.+?', '(?:' + generateRegexPartial(replacements));
|
||||
q.zwjCommonPatterns.push(re);
|
||||
console.log('Refactoring ' + replacements.length + ' complete sets of ' + pattern.numCombinations + ' zwj from ' + pattern.name);
|
||||
} else {
|
||||
console.log('did not find any complete sets of ' + pattern.name);
|
||||
}
|
||||
});
|
||||
|
||||
q.next();
|
||||
},
|
||||
|
||||
// with all info, generate a RegExp that will catch
|
||||
// only standard emoji that are present in our assets
|
||||
function generateRegExp(q) {
|
||||
console.log('generating a RegExp for available assets');
|
||||
q.re = '';
|
||||
|
||||
// The Zero-width joiner common patterns, if present, need to come first
|
||||
if (q.zwjCommonPatterns.length) {
|
||||
q.re += q.zwjCommonPatterns.join('|') + '|';
|
||||
}
|
||||
|
||||
// Then the rest of the zwjs
|
||||
if (q.zwj.length) {
|
||||
q.re += generateRegexPartial(q.zwj) + '|';
|
||||
}
|
||||
|
||||
// Group the variant sensitive keycaps
|
||||
if (q.sensitiveKeycaps.length) {
|
||||
q.re += '(?:' + generateRegexPartial(q.sensitiveKeycaps) + ')\\ufe0f?\\u20e3|';
|
||||
}
|
||||
|
||||
// Next, add the diversity enabled Emoji that may include a skin tone suffix
|
||||
if (q.diversity.length + q.diversitySensitive.length) {
|
||||
q.re += '(?:';
|
||||
if (q.diversitySensitive.length) {
|
||||
// Some diversity are sensitive to variants
|
||||
q.re += '(?:' + generateRegexPartial(q.diversitySensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
|
||||
if (q.diversity.length) {
|
||||
q.re += '|';
|
||||
}
|
||||
}
|
||||
q.re += generateRegexPartial(q.diversity) + ')(?:' + generateRegexPartial(skinToneOptions) + '|)|';
|
||||
}
|
||||
|
||||
// Next, the normal Emoji
|
||||
q.re += generateRegexPartial(q.regular) + '|';
|
||||
|
||||
// Finally, add the rest of the sensitive ones that may be followed by U+FE0F but not U+FE0E
|
||||
q.re += '(?:' + generateRegexPartial(q.sensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
|
||||
q.next();
|
||||
},
|
||||
|
||||
// Last task of the queue: hands the generated RegExp source (q.re) over to
// createTwemoji, then loads ./create-dist. It does not call q.next().
function generateFile(q) {
  var regExpSource = q.re;
  console.log('generating ./twemoji.js');
  createTwemoji(regExpSource);
  require('./create-dist');
}
|
||||
|
||||
]);
|
||||
|
||||
|
||||
|
||||
function createTwemoji(re) {
|
||||
function createTwemoji() {
|
||||
fs.writeFileSync(
|
||||
file('2/twemoji.js'),
|
||||
'/*jslint indent: 2, browser: true, bitwise: true, plusplus: true */\n' +
|
||||
@ -1042,4 +599,8 @@ function createTwemoji(re) {
|
||||
) +
|
||||
'\n */'
|
||||
) + '());');
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
createTwemoji();
|
||||
require('./create-dist');
|
||||
|
Loading…
x
Reference in New Issue
Block a user