Import Emojibase data (#35229)

This commit is contained in:
Echo 2025-07-02 10:58:39 +02:00 committed by GitHub
parent b1375328e1
commit 52bc2f64f4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 342 additions and 0 deletions

View File

@ -0,0 +1,64 @@
import { flattenEmojiData, SUPPORTED_LOCALES } from 'emojibase';
import emojiEnData from 'emojibase-data/en/compact.json';
import emojiFrData from 'emojibase-data/fr/compact.json';
import { toSupportedLocale, unicodeToLocaleLabel } from './locale';
// Verifies that labels resolved through `unicodeToLocaleLabel` agree with the
// labels found in the flattened Emojibase compact datasets.
describe('unicodeToLocaleLabel', () => {
  // All cases are skin tone variants, looked up in the flattened data.
  const emojiTestCases = [
    '1F3CB-1F3FF-200D-2640-FE0F', // 🏋🏿‍♀️ Woman weightlifter, dark skin
    '1F468-1F3FB', // 👨🏻 Man, light skin
    '1F469-1F3FB-200D-2695-FE0F', // 👩🏻‍⚕️ Woman health worker, light skin
    '1F468-1F3FD-200D-1F692', // 👨🏽‍🚒 Man firefighter, medium skin
    '1F469-1F3FE', // 👩🏾 Woman, medium-dark skin
    '1F469-1F3FF-200D-1F4BB', // 👩🏿‍💻 Woman technologist, dark skin
    '1F478-1F3FF', // 👸🏿 Princess with dark skin tone
    '1F935-1F3FC-200D-2640-FE0F', // 🤵🏼‍♀️ Woman in tuxedo, medium-light skin
    '1F9D1-1F3FC', // 🧑🏼 Person, medium-light skin
    '1F9D4-1F3FE', // 🧔🏾 Person with beard, medium-dark skin
  ];

  // Maps every test hexcode to the label of the first matching dataset entry
  // (or `undefined` when the dataset has no such entry).
  const collectExpectedLabels = (
    flattened: readonly { hexcode: string; label: string }[],
  ) => {
    const labels = new Map<string, string | undefined>();
    for (const code of emojiTestCases) {
      labels.set(code, flattened.find((emoji) => emoji.hexcode === code)?.label);
    }
    return labels;
  };

  const emojiTestEnLabels = collectExpectedLabels(
    flattenEmojiData(emojiEnData),
  );
  const emojiTestFrLabels = collectExpectedLabels(
    flattenEmojiData(emojiFrData),
  );

  // One English and one French case per hexcode, in that order.
  const localeCases: [string, string, string | undefined][] = [];
  for (const code of emojiTestCases) {
    localeCases.push(
      [code, 'en', emojiTestEnLabels.get(code)],
      [code, 'fr', emojiTestFrLabels.get(code)],
    );
  }

  test.for(localeCases)(
    'returns correct label for %s for %s locale',
    async ([unicodeHex, locale, expectedLabel]) => {
      const label = await unicodeToLocaleLabel(unicodeHex, locale);
      expect(label).toBe(expectedLabel);
    },
  );
});
// Covers both branches of `toSupportedLocale`: pass-through and the English fallback.
describe('toSupportedLocale', () => {
  test('returns the same locale if it is supported', () => {
    SUPPORTED_LOCALES.forEach((supported) => {
      expect(toSupportedLocale(supported)).toBe(supported);
    });
  });

  test('returns "en" for unsupported locales', () => {
    // Neither an unknown tag nor a regional variant is expected to be accepted.
    ['xx', 'fr-CA'].forEach((unsupported) => {
      expect(toSupportedLocale(unsupported)).toBe('en');
    });
  });
});

View File

@ -0,0 +1,51 @@
import type { CompactEmoji, Locale } from 'emojibase';
import { flattenEmojiData, SUPPORTED_LOCALES } from 'emojibase';
// In-memory cache of lookup tables: locale -> (emoji hexcode -> emoji entry).
// This will be replaced with an IndexedDB cache in the future.
const localeCache: Map<Locale, Map<string, CompactEmoji>> = new Map();
/**
 * Resolves the localized label of an emoji.
 *
 * The lookup table of a locale is loaded on first use and then kept in
 * `localeCache` for later calls.
 *
 * @param unicodeHex - Emoji hexcode used as lookup key (e.g. `1F468-1F3FB`).
 * @param localeString - Requested locale; unsupported values fall back to `en`.
 * @returns The label of the emoji in the resolved locale.
 * @throws Error when the resolved locale has no (non-empty) label for the hexcode.
 */
export async function unicodeToLocaleLabel(
  unicodeHex: string,
  localeString: string,
) {
  const locale = toSupportedLocale(localeString);

  const cachedMap = localeCache.get(locale);
  const hexMap = cachedMap ?? (await loadLocaleLabels(locale));
  if (!cachedMap) {
    localeCache.set(locale, hexMap);
  }

  const entry = hexMap.get(unicodeHex);
  if (entry?.label) {
    return entry.label;
  }

  throw new Error(
    `Label for unicode hex ${unicodeHex} not found in locale ${locale}`,
  );
}
/**
 * Dynamically imports the compact Emojibase dataset of a locale and indexes
 * its flattened entries by hexcode.
 *
 * @param locale - Locale whose `compact.json` dataset is imported.
 * @returns A map from emoji hexcode to the corresponding emoji entry.
 * @throws Error when the imported module's default export is not an array.
 */
async function loadLocaleLabels(
  locale: Locale,
): Promise<Map<string, CompactEmoji>> {
  const localeModule = ((await import(
    `emojibase-data/${locale}/compact.json`
  )) ?? { default: [] }) as { default: CompactEmoji[] };

  const localeEmoji = localeModule.default;
  if (!Array.isArray(localeEmoji)) {
    throw new Error(`Locale data for ${locale} not found`);
  }

  // Later entries overwrite earlier ones that share a hexcode.
  const hexMap = new Map<string, CompactEmoji>();
  for (const emoji of flattenEmojiData(localeEmoji)) {
    hexMap.set(emoji.hexcode, emoji);
  }
  return hexMap;
}
/**
 * Narrows an arbitrary locale string to a locale Emojibase supports.
 *
 * @param locale - Locale string to check.
 * @returns The same locale when it is supported, otherwise `'en'`.
 */
export function toSupportedLocale(locale: string): Locale {
  // Default to English if unsupported
  return isSupportedLocale(locale) ? locale : 'en';
}
/** Type guard: true when `locale` is one of Emojibase's `SUPPORTED_LOCALES`. */
function isSupportedLocale(locale: string): locale is Locale {
  // Widen the list to plain strings so the candidate needs no cast.
  const supportedLocales: readonly string[] = SUPPORTED_LOCALES;
  return supportedLocales.includes(locale);
}

View File

@ -0,0 +1,72 @@
import { readdir } from 'fs/promises';
import { basename, resolve } from 'path';
import unicodeEmojis from 'emojibase-data/en/data.json';
import { twemojiToUnicodeInfo, unicodeToTwemojiHex } from './normalize';
// This assumes tests are run from project root
const emojiDirectory = resolve(process.cwd(), 'public/emoji');
const emojiSVGFiles = await readdir(emojiDirectory, { withFileTypes: true });

// Upper-cased base names (without `.svg`) of every regular SVG file in the
// emoji directory; files ending in `_border.svg` are left out.
const svgFileNames: string[] = [];
for (const file of emojiSVGFiles) {
  const isEmojiSvg =
    file.isFile() &&
    file.name.endsWith('.svg') &&
    !file.name.endsWith('_border.svg');
  if (isEmojiSvg) {
    svgFileNames.push(basename(file.name, '.svg').toUpperCase());
  }
}
// Cross-checks the Unicode <-> Twemoji name conversions against the Emojibase
// dataset and the SVG files found on disk.
describe('normalizeEmoji', () => {
  describe('unicodeToSVGName', () => {
    // Our version of Twemoji only supports up to version 15.1
    const supportedEmojis: [string, string][] = [];
    for (const emoji of unicodeEmojis) {
      if (emoji.version < 16) {
        supportedEmojis.push([emoji.hexcode, emoji.label]);
      }
    }

    test.concurrent.for(supportedEmojis)(
      'verifying an emoji exists for %s (%s)',
      ([hexcode], { expect }) => {
        expect(svgFileNames).toContain(unicodeToTwemojiHex(hexcode));
      },
    );
  });

  describe('twemojiToUnicodeInfo', () => {
    // hexcode -> label for every emoji and each of its skin variants.
    const unicodeMap = new Map<string, string>();
    for (const emoji of unicodeEmojis) {
      unicodeMap.set(emoji.hexcode, emoji.label);
      if (emoji.skins) {
        for (const skin of emoji.skins) {
          unicodeMap.set(skin.hexcode, skin.label);
        }
      }
    }

    test.concurrent.for(svgFileNames)(
      'verifying SVG file %s maps to Unicode emoji',
      (svgFileName, { expect }) => {
        assert(!!svgFileName);

        const result = twemojiToUnicodeInfo(svgFileName);
        const hexcode =
          typeof result === 'string' ? result : result.unqualified;

        if (hexcode) {
          assert(!!hexcode);
          expect(
            unicodeMap.has(hexcode),
            `${hexcode} (${svgFileName}) not found`,
          ).toBeTruthy();
          return;
        }

        // No hexcode means this is a special case like the Shibuya 109 emoji
        expect(result).toHaveProperty('label');
      },
    );
  });
});

View File

@ -0,0 +1,135 @@
// Code points with a structural role inside emoji sequences
const VARIATION_SELECTOR_CODE = 0xFE0F;
const KEYCAP_CODE = 0x20E3;

// Gender sign code points
const GENDER_FEMALE_CODE = 0x2640;
const GENDER_MALE_CODE = 0x2642;

// Skin tone modifier code points, lightest to darkest
const SKIN_TONE_CODES = [
  0x1F3FB, // Light skin tone
  0x1F3FC, // Medium-light skin tone
  0x1F3FD, // Medium skin tone
  0x1F3FE, // Medium-dark skin tone
  0x1F3FF, // Dark skin tone
] as const;

// Individual emoji that need special handling during normalization
const SKIER_CODE = 0x26F7;
const CHRISTMAS_TREE_CODE = 0x1F384;
const MR_CLAUS_CODE = 0x1F385;
const EYE_CODE = 0x1F441;
const LEVITATING_PERSON_CODE = 0x1F574;
const SPEECH_BUBBLE_CODE = 0x1F5E8;
const MS_CLAUS_CODE = 0x1F936;
/**
 * Converts a Unicode emoji hexcode into the hex name used to match the
 * Twemoji SVG files.
 *
 * @param unicodeHex - Dash-separated hex code points (e.g. `0023-FE0F-20E3`).
 * @returns Upper-case, dash-separated hex code points without zero padding.
 */
export function unicodeToTwemojiHex(unicodeHex: string): string {
  const codes = hexStringToNumbers(unicodeHex);

  // Both conditions depend on the whole sequence, so evaluate them once.
  const endsWithKeycap = codes.at(-1) === KEYCAP_CODE;
  const isEyeInSpeechBubble =
    codes.at(0) === EYE_CODE && codes.at(-2) === SPEECH_BUBBLE_CODE;

  const normalizedCodes = codes.filter((code, index) => {
    if (!code) {
      return false;
    }
    if (code !== VARIATION_SELECTOR_CODE) {
      return true;
    }
    // Some emoji have their variation selector removed:
    // key emoji drop the selector in second position, and the eye in speech
    // bubble drops every selector.
    const isKeycapSelector = index === 1 && endsWithKeycap;
    return !(isKeycapSelector || isEyeInSpeechBubble);
  });

  // A padding of 0 removes zero padding to correctly match the SVG filenames
  return hexNumbersToString(normalizedCodes, 0);
}
/** Description of a Twemoji file that has no direct one-to-one Unicode hexcode. */
interface TwemojiSpecificEmoji {
  /** Hexcode of the base emoji, when one exists. */
  unqualified?: string;
  /** Normalized gender code (`GENDER_FEMALE_CODE` or `GENDER_MALE_CODE`). */
  gender?: number;
  /** Skin tone code point. */
  skin?: number;
  /** Human-readable name, used for emoji without any Unicode hexcode. */
  label?: string;
}

// Normalize man/woman to male/female
const GENDER_CODES_MAP: Record<number, number> = {
  [GENDER_FEMALE_CODE]: GENDER_FEMALE_CODE,
  [GENDER_MALE_CODE]: GENDER_MALE_CODE,
  // These are man/woman markers, but are used for gender sometimes.
  0x1f468: GENDER_MALE_CODE,
  0x1f469: GENDER_FEMALE_CODE,
};

// Twemoji hex names (upper-case) that are resolved by direct lookup.
const TWEMOJI_SPECIAL_CASES: Record<string, string | TwemojiSpecificEmoji> = {
  // Eye in speech bubble
  '1F441-200D-1F5E8': '1F441-FE0F-200D-1F5E8-FE0F',
  // An emoji that was never ported to the Unicode standard.
  // See: https://emojipedia.org/shibuya
  'E50A': { label: 'Shibuya 109' },
};
/**
 * Maps a Twemoji hex name back to Unicode emoji information.
 *
 * @param twemojiHex - Dash-separated hex code points of a Twemoji file name
 *   (case-insensitive for the special-case lookup).
 * @returns Either a Unicode hexcode string (upper-case, zero-padded to four
 *   digits), or a `TwemojiSpecificEmoji` object for emoji that Twemoji models
 *   differently from Unicode (special cases, skier, levitating person).
 */
export function twemojiToUnicodeInfo(
  twemojiHex: string,
): TwemojiSpecificEmoji | string {
  const specialCase = TWEMOJI_SPECIAL_CASES[twemojiHex.toUpperCase()];
  if (specialCase) {
    return specialCase;
  }

  const codes = hexStringToNumbers(twemojiHex);

  // `SKIN_TONE_CODES` is a literal tuple; widen it so any number can be tested.
  const skinToneCodes: readonly number[] = SKIN_TONE_CODES;

  let gender: undefined | number;
  let skin: undefined | number;
  for (const code of codes) {
    if (code in GENDER_CODES_MAP) {
      gender = GENDER_CODES_MAP[code];
    } else if (skinToneCodes.includes(code)) {
      // Membership must be tested by value: the `in` operator on an array
      // checks indices (0..4), so it would never match a skin tone code point.
      skin = code;
    }
  }

  let mappedCodes: (number | undefined)[] = codes;

  if (codes.at(-1) === CHRISTMAS_TREE_CODE && codes.length >= 3 && gender) {
    // Twemoji uses the christmas tree with a ZWJ for Mr. and Mrs. Claus,
    // but in Unicode that only works for Mx. Claus.
    const START_CODE =
      gender === GENDER_FEMALE_CODE ? MS_CLAUS_CODE : MR_CLAUS_CODE;
    // An undefined skin is dropped by hexNumbersToString.
    mappedCodes = [START_CODE, skin];
  } else if (codes.at(-1) === KEYCAP_CODE && codes.length === 2) {
    // For key emoji, insert the variation selector
    mappedCodes = [codes[0], VARIATION_SELECTOR_CODE, KEYCAP_CODE];
  } else if (
    codes.at(0) === SKIER_CODE ||
    codes.at(0) === LEVITATING_PERSON_CODE
  ) {
    // Twemoji offers more gender and skin options for the skier and levitating person emoji.
    return {
      unqualified: hexNumbersToString([codes.at(0)]),
      skin,
      gender,
    };
  }

  return hexNumbersToString(mappedCodes);
}
/**
 * Parses a dash-separated list of hexadecimal code points.
 *
 * @param hexString - Hex code points joined by `-` (e.g. `1F468-1F3FB`).
 * @returns The parsed numbers in order; segments that do not parse as
 *   hexadecimal are skipped.
 */
function hexStringToNumbers(hexString: string): number[] {
  const numbers: number[] = [];
  for (const segment of hexString.split('-')) {
    const value = Number.parseInt(segment, 16);
    if (!Number.isNaN(value)) {
      numbers.push(value);
    }
  }
  return numbers;
}
/**
 * Formats code points as an upper-case, dash-separated hex string.
 *
 * @param codes - Candidate code points; anything that is not a positive
 *   number is ignored.
 * @param padding - Minimum digit count per code point, left-padded with zeros.
 * @returns The formatted code points joined by `-`.
 */
function hexNumbersToString(codes: unknown[], padding = 4): string {
  const parts: string[] = [];
  for (const code of codes) {
    if (typeof code !== 'number' || Number.isNaN(code) || !(code > 0)) {
      continue;
    }
    parts.push(code.toString(16).padStart(padding, '0').toUpperCase());
  }
  return parts.join('-');
}

View File

@ -66,6 +66,8 @@
"cross-env": "^7.0.3", "cross-env": "^7.0.3",
"detect-passive-events": "^2.0.3", "detect-passive-events": "^2.0.3",
"emoji-mart": "npm:emoji-mart-lazyload@latest", "emoji-mart": "npm:emoji-mart-lazyload@latest",
"emojibase": "^16.0.0",
"emojibase-data": "^16.0.3",
"escape-html": "^1.0.3", "escape-html": "^1.0.3",
"fuzzysort": "^3.0.0", "fuzzysort": "^3.0.0",
"history": "^4.10.1", "history": "^4.10.1",

View File

@ -2667,6 +2667,8 @@ __metadata:
cross-env: "npm:^7.0.3" cross-env: "npm:^7.0.3"
detect-passive-events: "npm:^2.0.3" detect-passive-events: "npm:^2.0.3"
emoji-mart: "npm:emoji-mart-lazyload@latest" emoji-mart: "npm:emoji-mart-lazyload@latest"
emojibase: "npm:^16.0.0"
emojibase-data: "npm:^16.0.3"
escape-html: "npm:^1.0.3" escape-html: "npm:^1.0.3"
eslint: "npm:^9.23.0" eslint: "npm:^9.23.0"
eslint-import-resolver-typescript: "npm:^4.2.5" eslint-import-resolver-typescript: "npm:^4.2.5"
@ -6533,6 +6535,22 @@ __metadata:
languageName: node languageName: node
linkType: hard linkType: hard
"emojibase-data@npm:^16.0.3":
version: 16.0.3
resolution: "emojibase-data@npm:16.0.3"
peerDependencies:
emojibase: "*"
checksum: 10c0/d82520917c2ec326e737da9c5a57472e41a719777fa4770b52b75f0568791613fc94829898831c7b3fff1528134de01019cdf34e571d214fee19e40950d68b7f
languageName: node
linkType: hard
"emojibase@npm:^16.0.0":
version: 16.0.0
resolution: "emojibase@npm:16.0.0"
checksum: 10c0/ec49ca2e131d349fa1f1dbe6ee8a6bf12da6225ce2de99d488e67a3cb80ac282f27aa480f0a7062c0c069c24508684ba524418be56b475cbd937877663686c47
languageName: node
linkType: hard
"encodeurl@npm:~1.0.2": "encodeurl@npm:~1.0.2":
version: 1.0.2 version: 1.0.2
resolution: "encodeurl@npm:1.0.2" resolution: "encodeurl@npm:1.0.2"