mirror of
https://github.com/mastodon/mastodon.git
synced 2025-07-15 16:58:14 +00:00
Import Emojibase data (#35229)
This commit is contained in:
parent
b1375328e1
commit
52bc2f64f4
64
app/javascript/mastodon/features/emoji/locale.test.ts
Normal file
64
app/javascript/mastodon/features/emoji/locale.test.ts
Normal file
|
@ -0,0 +1,64 @@
|
||||||
|
import { flattenEmojiData, SUPPORTED_LOCALES } from 'emojibase';
|
||||||
|
import emojiEnData from 'emojibase-data/en/compact.json';
|
||||||
|
import emojiFrData from 'emojibase-data/fr/compact.json';
|
||||||
|
|
||||||
|
import { toSupportedLocale, unicodeToLocaleLabel } from './locale';
|
||||||
|
|
||||||
|
describe('unicodeToLocaleLabel', () => {
  // Every sample is a skin-tone variant, so the expected labels have to be
  // looked up in the flattened dataset rather than the top-level entries.
  const hexcodes = [
    '1F3CB-1F3FF-200D-2640-FE0F', // Woman weightlifter, dark skin
    '1F468-1F3FB', // Man, light skin
    '1F469-1F3FB-200D-2695-FE0F', // Woman health worker, light skin
    '1F468-1F3FD-200D-1F692', // Man firefighter, medium skin
    '1F469-1F3FE', // Woman, medium-dark skin
    '1F469-1F3FF-200D-1F4BB', // Woman technologist, dark skin
    '1F478-1F3FF', // Princess with dark skin tone
    '1F935-1F3FC-200D-2640-FE0F', // Woman in tuxedo, medium-light skin
    '1F9D1-1F3FC', // Person, medium-light skin
    '1F9D4-1F3FE', // Person with beard, medium-dark skin
  ];

  // Reference data the function under test is compared against.
  const enEmojis = flattenEmojiData(emojiEnData);
  const frEmojis = flattenEmojiData(emojiFrData);

  // One [hexcode, locale, expected label] row per sample and locale.
  const cases: [string, string, string | undefined][] = [];
  for (const hexcode of hexcodes) {
    const matches = (emoji: { hexcode: string }) => emoji.hexcode === hexcode;
    cases.push(
      [hexcode, 'en', enEmojis.find(matches)?.label],
      [hexcode, 'fr', frEmojis.find(matches)?.label],
    );
  }

  test.for(cases)(
    'returns correct label for %s for %s locale',
    async ([unicodeHex, locale, expectedLabel]) => {
      const label = await unicodeToLocaleLabel(unicodeHex, locale);
      expect(label).toBe(expectedLabel);
    },
  );
});
|
||||||
|
|
||||||
|
describe('toSupportedLocale', () => {
  test('returns the same locale if it is supported', () => {
    // Every locale Emojibase advertises must pass through unchanged.
    SUPPORTED_LOCALES.forEach((supported) => {
      expect(toSupportedLocale(supported)).toBe(supported);
    });
  });

  test('returns "en" for unsupported locales', () => {
    // Anything outside the supported list falls back to English.
    ['xx', 'fr-CA'].forEach((unsupported) => {
      expect(toSupportedLocale(unsupported)).toBe('en');
    });
  });
});
|
51
app/javascript/mastodon/features/emoji/locale.ts
Normal file
51
app/javascript/mastodon/features/emoji/locale.ts
Normal file
|
@ -0,0 +1,51 @@
|
||||||
|
import type { CompactEmoji, Locale } from 'emojibase';
|
||||||
|
import { flattenEmojiData, SUPPORTED_LOCALES } from 'emojibase';
|
||||||
|
|
||||||
|
// In-memory cache of hexcode lookup tables, keyed by locale.
// This will be replaced with an IndexedDB cache in the future.
const localeCache = new Map<Locale, Map<string, CompactEmoji>>();

/**
 * Looks up the localized label of an emoji.
 *
 * @param unicodeHex - Emoji hexcode, e.g. `1F468-1F3FB`.
 * @param localeString - Requested locale; resolved through `toSupportedLocale`.
 * @returns The label stored for the hexcode in the resolved locale.
 * @throws Error when the resolved locale has no (non-empty) label for the hexcode.
 */
export async function unicodeToLocaleLabel(
  unicodeHex: string,
  localeString: string,
): Promise<string> {
  const locale = toSupportedLocale(localeString);

  // Load the locale's table on first use and remember it for later calls.
  const cachedMap = localeCache.get(locale);
  const hexMap = cachedMap ?? (await loadLocaleLabels(locale));
  if (cachedMap === undefined) {
    localeCache.set(locale, hexMap);
  }

  const emoji = hexMap.get(unicodeHex);
  if (emoji?.label) {
    return emoji.label;
  }

  throw new Error(
    `Label for unicode hex ${unicodeHex} not found in locale ${locale}`,
  );
}
|
||||||
|
|
||||||
|
/**
 * Dynamically imports the compact Emojibase dataset for a locale and indexes
 * it by hexcode.
 *
 * @param locale - Locale whose `compact.json` should be loaded.
 * @returns Map from hexcode to emoji, covering the flattened dataset.
 * @throws Error when the imported module's default export is not an array.
 */
async function loadLocaleLabels(
  locale: Locale,
): Promise<Map<string, CompactEmoji>> {
  const localeModule = ((await import(
    `emojibase-data/${locale}/compact.json`
  )) ?? { default: [] }) as { default: CompactEmoji[] };
  const localeEmoji = localeModule.default;

  if (!Array.isArray(localeEmoji)) {
    throw new Error(`Locale data for ${locale} not found`);
  }

  // Index the flattened list; a later duplicate hexcode overwrites an earlier one.
  const hexMap = new Map<string, CompactEmoji>();
  for (const emoji of flattenEmojiData(localeEmoji)) {
    hexMap.set(emoji.hexcode, emoji);
  }
  return hexMap;
}
|
||||||
|
|
||||||
|
/**
 * Maps an arbitrary locale string onto a locale Emojibase has data for.
 *
 * The match is case-insensitive. Emojibase lists its locales in lower case
 * (e.g. `en-gb`), while locale tags are commonly written with an upper-case
 * region (`en-GB`); without normalizing, such tags would needlessly fall back
 * to English.
 *
 * @param locale - Requested locale tag, in any letter case.
 * @returns The matching Emojibase locale, or `'en'` when there is none.
 */
export function toSupportedLocale(locale: string): Locale {
  const normalized = locale.toLowerCase();
  if (isSupportedLocale(normalized)) {
    return normalized;
  }
  return 'en'; // Default to English if unsupported
}
|
||||||
|
|
||||||
|
/**
 * Type guard reporting whether `locale` is one of Emojibase's
 * `SUPPORTED_LOCALES` (exact, case-sensitive match).
 */
function isSupportedLocale(locale: string): locale is Locale {
  // Widen the list instead of casting the argument.
  const supported: readonly string[] = SUPPORTED_LOCALES;
  return supported.includes(locale);
}
|
72
app/javascript/mastodon/features/emoji/normalize.test.ts
Normal file
72
app/javascript/mastodon/features/emoji/normalize.test.ts
Normal file
|
@ -0,0 +1,72 @@
|
||||||
|
import { readdir } from 'fs/promises';
|
||||||
|
import { basename, resolve } from 'path';
|
||||||
|
|
||||||
|
import unicodeEmojis from 'emojibase-data/en/data.json';
|
||||||
|
|
||||||
|
import { twemojiToUnicodeInfo, unicodeToTwemojiHex } from './normalize';
|
||||||
|
|
||||||
|
// Directory entries of the bundled emoji assets.
// This assumes tests are run from project root.
const emojiSVGFiles = await readdir(resolve(process.cwd(), 'public/emoji'), {
  withFileTypes: true,
});

// Upper-cased base names of the plain SVG files; `*_border.svg` variants and
// anything that is not a regular `.svg` file are left out.
const svgFileNames: string[] = [];
for (const entry of emojiSVGFiles) {
  const isEmojiSvg =
    entry.isFile() &&
    entry.name.endsWith('.svg') &&
    !entry.name.endsWith('_border.svg');
  if (isEmojiSvg) {
    svgFileNames.push(basename(entry.name, '.svg').toUpperCase());
  }
}
|
||||||
|
|
||||||
|
describe('normalizeEmoji', () => {
  describe('unicodeToSVGName', () => {
    // Our version of Twemoji only supports up to version 15.1
    const supportedEmojis = unicodeEmojis.filter((emoji) => emoji.version < 16);
    const cases = supportedEmojis.map(
      (emoji) => [emoji.hexcode, emoji.label] as [string, string],
    );

    test.concurrent.for(cases)(
      'verifying an emoji exists for %s (%s)',
      ([hexcode], { expect }) => {
        // Each Unicode hexcode must normalize to the name of an existing SVG.
        expect(svgFileNames).toContain(unicodeToTwemojiHex(hexcode));
      },
    );
  });

  describe('twemojiToUnicodeInfo', () => {
    // hexcode -> label for every emoji and every skin-tone variant.
    const unicodeMap = new Map<string, string>();
    for (const emoji of unicodeEmojis) {
      unicodeMap.set(emoji.hexcode, emoji.label);
      if (emoji.skins) {
        for (const skin of emoji.skins) {
          unicodeMap.set(skin.hexcode, skin.label);
        }
      }
    }

    test.concurrent.for(svgFileNames)(
      'verifying SVG file %s maps to Unicode emoji',
      (svgFileName, { expect }) => {
        assert(!!svgFileName);

        const result = twemojiToUnicodeInfo(svgFileName);
        const hexcode =
          typeof result === 'string' ? result : result.unqualified;

        if (hexcode) {
          expect(
            unicodeMap.has(hexcode),
            `${hexcode} (${svgFileName}) not found`,
          ).toBeTruthy();
          return;
        }

        // No hexcode means this is a special case like the Shibuya 109 emoji
        expect(result).toHaveProperty('label');
      },
    );
  });
});
|
135
app/javascript/mastodon/features/emoji/normalize.ts
Normal file
135
app/javascript/mastodon/features/emoji/normalize.ts
Normal file
|
@ -0,0 +1,135 @@
|
||||||
|
// --- Utility code points ---
const VARIATION_SELECTOR_CODE = 0xfe0f; // U+FE0F
const KEYCAP_CODE = 0x20e3; // U+20E3

// --- Gender sign code points ---
const GENDER_FEMALE_CODE = 0x2640; // U+2640
const GENDER_MALE_CODE = 0x2642; // U+2642

// --- Skin tone modifier code points, lightest to darkest ---
const SKIN_TONE_CODES = [
  0x1f3fb, // Light skin tone
  0x1f3fc, // Medium-light skin tone
  0x1f3fd, // Medium skin tone
  0x1f3fe, // Medium-dark skin tone
  0x1f3ff, // Dark skin tone
] as const;

// --- Code points of emoji that need special handling ---
const SKIER_CODE = 0x26f7;
const CHRISTMAS_TREE_CODE = 0x1f384;
const MR_CLAUS_CODE = 0x1f385;
const EYE_CODE = 0x1f441;
const LEVITATING_PERSON_CODE = 0x1f574;
const SPEECH_BUBBLE_CODE = 0x1f5e8;
const MS_CLAUS_CODE = 0x1f936;
|
||||||
|
|
||||||
|
/**
 * Converts a Unicode emoji hexcode into the hex name used for the Twemoji
 * SVG files.
 *
 * The variation selector is dropped in two cases: at position 1 of a
 * sequence ending in the keycap code, and anywhere in the "eye in speech
 * bubble" sequence. The result is written without zero padding.
 *
 * @param unicodeHex - Dash-separated hex code points, e.g. `0023-FE0F-20E3`.
 * @returns Dash-separated, upper-case, unpadded hex code points.
 */
export function unicodeToTwemojiHex(unicodeHex: string): string {
  const codes = hexStringToNumbers(unicodeHex);

  // Sequence-level facts, computed once.
  const isKeycapSequence = codes.at(-1) === KEYCAP_CODE;
  const isEyeInSpeechBubble =
    codes.at(0) === EYE_CODE && codes.at(-2) === SPEECH_BUBBLE_CODE;

  const normalizedCodes = codes.filter((code, index) => {
    if (!code) {
      return false;
    }
    if (code !== VARIATION_SELECTOR_CODE) {
      return true;
    }
    // Some emoji have their variation selector removed
    const droppedForKeycap = index === 1 && isKeycapSequence;
    return !(droppedForKeycap || isEyeInSpeechBubble);
  });

  // A padding of 0 removes zero padding to correctly match the SVG filenames
  return hexNumbersToString(normalizedCodes, 0);
}
|
||||||
|
|
||||||
|
/** Result shape for Twemoji files that have no direct Unicode hexcode. */
interface TwemojiSpecificEmoji {
  /** Hex string of the base emoji. */
  unqualified?: string;
  /** Gender code point detected in the Twemoji sequence. */
  gender?: number;
  /** Skin tone code point detected in the Twemoji sequence. */
  skin?: number;
  /** Human-readable label for emoji without a Unicode hexcode. */
  label?: string;
}

// Normalize man/woman to male/female
const GENDER_CODES_MAP: Record<number, number> = {
  // These are man/woman markers, but are used for gender sometimes.
  [0x1f468]: GENDER_MALE_CODE,
  [0x1f469]: GENDER_FEMALE_CODE,
  // The gender signs map to themselves.
  [GENDER_MALE_CODE]: GENDER_MALE_CODE,
  [GENDER_FEMALE_CODE]: GENDER_FEMALE_CODE,
};

// Twemoji names (upper case) that bypass the generic conversion.
const TWEMOJI_SPECIAL_CASES: Record<string, string | TwemojiSpecificEmoji> = {
  // Eye in speech bubble
  '1F441-200D-1F5E8': '1F441-FE0F-200D-1F5E8-FE0F',
  // An emoji that was never ported to the Unicode standard.
  // See: https://emojipedia.org/shibuya
  E50A: { label: 'Shibuya 109' },
};
|
||||||
|
|
||||||
|
/**
 * Maps a Twemoji file name (dash-separated hex code points) back to Unicode
 * emoji information.
 *
 * @param twemojiHex - Twemoji hex name; matched case-insensitively against the
 *   special-case table.
 * @returns Either a Unicode hexcode string (upper case, zero-padded to four
 *   digits), or a `TwemojiSpecificEmoji` object for special cases and for the
 *   skier / levitating person variants.
 */
export function twemojiToUnicodeInfo(
  twemojiHex: string,
): TwemojiSpecificEmoji | string {
  const specialCase = TWEMOJI_SPECIAL_CASES[twemojiHex.toUpperCase()];
  if (specialCase) {
    return specialCase;
  }

  const codes = hexStringToNumbers(twemojiHex);

  // Collect the gender and skin tone present in the sequence; when several
  // candidates occur, the last one wins.
  let gender: number | undefined;
  let skin: number | undefined;
  for (const code of codes) {
    if (code in GENDER_CODES_MAP) {
      gender = GENDER_CODES_MAP[code];
    } else if (SKIN_TONE_CODES.some((tone) => tone === code)) {
      // SKIN_TONE_CODES is an array, so membership must be tested by value;
      // the `in` operator would test array indices instead.
      skin = code;
    }
  }

  let mappedCodes: (number | undefined)[] = codes;

  if (codes.at(-1) === CHRISTMAS_TREE_CODE && codes.length >= 3 && gender) {
    // Twemoji uses the christmas tree with a ZWJ for Mr. and Mrs. Claus,
    // but in Unicode that only works for Mx. Claus.
    const START_CODE =
      gender === GENDER_FEMALE_CODE ? MS_CLAUS_CODE : MR_CLAUS_CODE;
    // An undefined skin is dropped by hexNumbersToString.
    mappedCodes = [START_CODE, skin];
  } else if (codes.at(-1) === KEYCAP_CODE && codes.length === 2) {
    // For key emoji, insert the variation selector
    mappedCodes = [codes[0], VARIATION_SELECTOR_CODE, KEYCAP_CODE];
  } else if (
    codes.at(0) === SKIER_CODE ||
    codes.at(0) === LEVITATING_PERSON_CODE
  ) {
    // Twemoji offers more gender and skin options for the skier and levitating person emoji.
    return {
      unqualified: hexNumbersToString([codes.at(0)]),
      skin,
      gender,
    };
  }

  return hexNumbersToString(mappedCodes);
}
|
||||||
|
|
||||||
|
/**
 * Splits a dash-separated hex string into numeric code points.
 * Segments that `Number.parseInt(…, 16)` cannot parse are skipped.
 */
function hexStringToNumbers(hexString: string): number[] {
  const numbers: number[] = [];
  for (const segment of hexString.split('-')) {
    const value = Number.parseInt(segment, 16);
    if (!Number.isNaN(value)) {
      numbers.push(value);
    }
  }
  return numbers;
}
|
||||||
|
|
||||||
|
/**
 * Joins code points into a dash-separated, upper-case hex string.
 *
 * @param codes - Candidate values; anything that is not a positive number is
 *   skipped.
 * @param padding - Minimum digits per code point, left-padded with zeros
 *   (defaults to 4).
 */
function hexNumbersToString(codes: unknown[], padding = 4): string {
  const parts: string[] = [];
  for (const code of codes) {
    // `code > 0` is also false for NaN, so no separate NaN check is needed.
    if (typeof code !== 'number' || !(code > 0)) {
      continue;
    }
    parts.push(code.toString(16).toUpperCase().padStart(padding, '0'));
  }
  return parts.join('-');
}
|
|
@ -66,6 +66,8 @@
|
||||||
"cross-env": "^7.0.3",
|
"cross-env": "^7.0.3",
|
||||||
"detect-passive-events": "^2.0.3",
|
"detect-passive-events": "^2.0.3",
|
||||||
"emoji-mart": "npm:emoji-mart-lazyload@latest",
|
"emoji-mart": "npm:emoji-mart-lazyload@latest",
|
||||||
|
"emojibase": "^16.0.0",
|
||||||
|
"emojibase-data": "^16.0.3",
|
||||||
"escape-html": "^1.0.3",
|
"escape-html": "^1.0.3",
|
||||||
"fuzzysort": "^3.0.0",
|
"fuzzysort": "^3.0.0",
|
||||||
"history": "^4.10.1",
|
"history": "^4.10.1",
|
||||||
|
|
18
yarn.lock
18
yarn.lock
|
@ -2667,6 +2667,8 @@ __metadata:
|
||||||
cross-env: "npm:^7.0.3"
|
cross-env: "npm:^7.0.3"
|
||||||
detect-passive-events: "npm:^2.0.3"
|
detect-passive-events: "npm:^2.0.3"
|
||||||
emoji-mart: "npm:emoji-mart-lazyload@latest"
|
emoji-mart: "npm:emoji-mart-lazyload@latest"
|
||||||
|
emojibase: "npm:^16.0.0"
|
||||||
|
emojibase-data: "npm:^16.0.3"
|
||||||
escape-html: "npm:^1.0.3"
|
escape-html: "npm:^1.0.3"
|
||||||
eslint: "npm:^9.23.0"
|
eslint: "npm:^9.23.0"
|
||||||
eslint-import-resolver-typescript: "npm:^4.2.5"
|
eslint-import-resolver-typescript: "npm:^4.2.5"
|
||||||
|
@ -6533,6 +6535,22 @@ __metadata:
|
||||||
languageName: node
|
languageName: node
|
||||||
linkType: hard
|
linkType: hard
|
||||||
|
|
||||||
|
"emojibase-data@npm:^16.0.3":
|
||||||
|
version: 16.0.3
|
||||||
|
resolution: "emojibase-data@npm:16.0.3"
|
||||||
|
peerDependencies:
|
||||||
|
emojibase: "*"
|
||||||
|
checksum: 10c0/d82520917c2ec326e737da9c5a57472e41a719777fa4770b52b75f0568791613fc94829898831c7b3fff1528134de01019cdf34e571d214fee19e40950d68b7f
|
||||||
|
languageName: node
|
||||||
|
linkType: hard
|
||||||
|
|
||||||
|
"emojibase@npm:^16.0.0":
|
||||||
|
version: 16.0.0
|
||||||
|
resolution: "emojibase@npm:16.0.0"
|
||||||
|
checksum: 10c0/ec49ca2e131d349fa1f1dbe6ee8a6bf12da6225ce2de99d488e67a3cb80ac282f27aa480f0a7062c0c069c24508684ba524418be56b475cbd937877663686c47
|
||||||
|
languageName: node
|
||||||
|
linkType: hard
|
||||||
|
|
||||||
"encodeurl@npm:~1.0.2":
|
"encodeurl@npm:~1.0.2":
|
||||||
version: 1.0.2
|
version: 1.0.2
|
||||||
resolution: "encodeurl@npm:1.0.2"
|
resolution: "encodeurl@npm:1.0.2"
|
||||||
|
|
Loading…
Reference in New Issue
Block a user