1
0
mirror of https://github.com/twitter/twemoji.git synced 2024-10-01 12:24:23 +00:00

Merge pull request #125 from twuttke/char_ranges

fix sorting and use regex char classes
This commit is contained in:
Andrea Giammarchi 2016-03-02 08:15:03 +00:00
commit dfa6f8e8a9
5 changed files with 99 additions and 46 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

2
2/twemoji.min.js vendored

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -184,7 +184,7 @@ Queue([
},
// add our own assets that are not part of the Unicode standard
function addMissingEmojiAndSort(q) {
function addMissingEmoji(q) {
q.nonStandard = [];
Object.keys(assets).forEach(function (path, i) {
assets[path].forEach(function (emoji) {
@ -202,21 +202,8 @@ Queue([
// console.log(q.nonStandard.join(', '));
}
// order by sequence of chars length
q.emojiSource = q.emojiSource.concat(q.nonStandard).sort(sort);
// actually this is not needed
// q.variantsSensitive.sort(sort);
q.emojiSource = q.emojiSource.concat(q.nonStandard)
q.next();
/**
 * Comparator for emoji keys: '-'-separated code point strings such as
 * "1f1e6-1f1e8" (assumed hexadecimal - TODO confirm against the asset names).
 * Longer keys (by string length) sort first; keys of equal length are
 * ordered by their first code point, highest first.
 * @param {string} a first key
 * @param {string} b second key
 * @return {number} negative when a sorts before b, positive when after
 */
function sort(a, b) {
  var diff = b.length - a.length;
  if (diff) return diff;
  // The code points are hex: parsing them with radix 10 yields NaN for keys
  // starting with a-f and stops at the first hex letter for the others
  // ("1f600" -> 1), which makes the comparator inconsistent.
  return parseInt(b.split('-')[0], 16) -
    parseInt(a.split('-')[0], 16);
}
},
// detect complete sets of five skin tones and a base
@ -250,6 +237,13 @@ Queue([
var sensitive = [];
var sensitiveKeycaps = [];
var diversitySensitive = [];
var skinToneOptions = [
'\\ud83c\\udffb',
'\\ud83c\\udffc',
'\\ud83c\\udffd',
'\\ud83c\\udffe',
'\\ud83c\\udfff'
];
var regular = [];
q.emojiSource.forEach(function (codePoints) {
var u;
@ -287,12 +281,12 @@ Queue([
// The Zero-width joiner Emojis, if present, need to come first
if (zwj.length) {
q.re += zwj.join('|') + '|';
q.re += generateRegexPartial(zwj) + '|';
}
// Group the variant sensitive keycaps
if (sensitiveKeycaps.length) {
q.re += '(?:' + sensitiveKeycaps.join('|') + ')\\ufe0f?\\u20e3|';
q.re += '(?:' + generateRegexPartial(sensitiveKeycaps) + ')\\ufe0f?\\u20e3|';
}
// Next, add the diversity enabled Emoji that may include a skin tone suffix
@ -300,20 +294,19 @@ Queue([
q.re += '(?:';
if (diversitySensitive.length) {
// Some diversity-enabled Emoji are sensitive to variants
q.re += '(?:' + diversitySensitive.join('|') + ')(?:\\ufe0f|(?!\\ufe0e))';
q.re += '(?:' + generateRegexPartial(diversitySensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
if (diversity.length) {
q.re += '|';
}
}
q.re += diversity.join('|') + ')(?:\\ud83c\\udffb|\\ud83c\\udffc|\\ud83c\\udffd|\\ud83c\\udffe|\\ud83c\\udfff|)|';
q.re += generateRegexPartial(diversity) + ')(?:' + generateRegexPartial(skinToneOptions) + '|)|';
}
// Next, the normal Emoji
q.re += regular.join('|') + '|';
q.re += generateRegexPartial(regular) + '|';
// Finally, add the rest of the sensitive ones that may be followed by U+FE0F but not U+FE0E
q.re += '(?:' + sensitive.join('|') + ')(?:\\ufe0f|(?!\\ufe0e))';
q.re += '(?:' + generateRegexPartial(sensitive) + ')(?:\\ufe0f|(?!\\ufe0e))';
q.next();
// basic utilities to convert codepoints to JSON strings
@ -341,6 +334,70 @@ Queue([
return r.join('');
}
// Builds one regex alternation from a list of escaped UTF-16 sequences.
// Items is an array of unicode sequences with \u escaping, like ["\u2963\ufe0f", "\u263a\ufe0f"]
// (each string holds a literal backslash + "u" + hex digits, not the character itself).
// Items get sorted by length (long to short), then by code unit value (low to high),
// so longer sequences are tried first and consecutive code units end up next to each other.
// Output is "or"ed together using | for regex.
// Output also combines adjacent items using character classes with ranges when they
// share a common prefix (everything except the last code unit).
// Example: "aab", "aac", "aad", "aag", "ba" becomes "aa[b-dg]|ba"
// Items without any \u escape are ignored; exact duplicates are emitted once.
// Returns an empty string when there is nothing to emit.
function generateRegexPartial(items) {
  var currentPrefix = null; // prefix shared by everything in charClass
  var result = [];          // finished alternatives, joined with | at the end
  var charClass = [];       // pieces of the character class being built
  var charRange = [];       // current run of consecutive last code units
  var lastUnit = null;      // numeric value of the last unit pushed to charRange

  items.map(function (item) {
    // Convert from "\u2963\ufe0f" into ["2963", "fe0f"]
    return item.split('\\u').slice(1);
  }).filter(function (itemParts) {
    // nothing to match for an item that had no \u escape at all
    return itemParts.length > 0;
  }).sort(sortMethod).forEach(function (itemParts) {
    var prefix = itemParts.slice(0, -1).join('\\u');
    if (prefix) {
      prefix = '\\u' + prefix;
    }
    var suffix = itemParts[itemParts.length - 1];
    var unit = parseInt(suffix, 16);
    if (prefix !== currentPrefix) {
      flushCharClass();
    } else if (unit === lastUnit) {
      // same prefix and same last unit: duplicate of the previous item
      return;
    }
    currentPrefix = prefix;
    // a range can only grow while the code units are consecutive
    if (charRange.length && unit !== lastUnit + 1) {
      flushCharRange();
    }
    charRange.push('\\u' + suffix);
    lastUnit = unit;
  });

  flushCharClass();

  return result.join('|');

  // a and b are arrays of hex UCS-2 units
  // Longer arrays first; equal lengths are ordered unit by unit, ascending,
  // which is the order the range detection above relies on.
  function sortMethod(a, b) {
    var diff = b.length - a.length;
    for (var i = 0; !diff && i < a.length; i++) {
      diff = parseInt(a[i], 16) - parseInt(b[i], 16);
    }
    return diff;
  }

  // Moves the pending run into charClass: runs of three or more units
  // collapse to "first-last", shorter runs are listed unit by unit.
  function flushCharRange() {
    charClass = charClass.concat((charRange.length < 3) ?
      charRange :
      [ charRange[0], '-', charRange.slice(-1)[0] ]
    );
    charRange = [];
  }

  // Emits the pending class as one alternative (prefix + unit or prefix + [class])
  // and resets the state for the next prefix.
  function flushCharClass() {
    flushCharRange();
    if (charClass.length) {
      result.push(currentPrefix + (charClass.length === 1 ?
        charClass[0] :
        '[' + charClass.join('') + ']'
      ));
    }
    charClass = [];
    currentPrefix = null;
  }
}
},
function generateFile(q) {
@ -665,15 +722,14 @@ function createTwemoji(re) {
/**
* Used to both remove the possible variant
* and to convert utf16 into code points.
* If there is a zero-width-joiner, leave the variant in.
* If there is a zero-width-joiner (U+200D), leave the variants in.
* @param {string} rawText the raw text of the emoji match
* @return whatever toCodePoint returns for the (possibly stripped) text
*/
function grabTheRightIcon(rawText) {
// if variant is present as \uFE0F
// NOTE(review): two forms of the same return statement follow (an indexOf
// based check and a regex based check); both strip every \uFE0F only when
// rawText contains no \u200D.
return toCodePoint(
rawText.indexOf('\u200D') < 0 ?
rawText.replace(/\uFE0F/g, '') :
rawText
return toCodePoint(/\u200D/.test(rawText) ?
rawText :
rawText.replace(/\uFE0F/g, '')
);
}