Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 9 additions & 4 deletions pipeline/process/nlp/Tokenizer.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import emojiRegex from "emoji-regex";
https://i.imgflip.com/4hkogk.jpg
*/

export type Tag = "code" | "url" | "mention" | "emoji" | "custom-emoji" | "word" | "unknown";
export type Tag = "code" | "url" | "email" | "mention" | "emoji" | "custom-emoji" | "word" | "unknown";

export interface Token {
text: string;
Expand Down Expand Up @@ -53,12 +53,17 @@ const Matchers: Readonly<TokenMatcher[]> = [
regex: /https?:\/\/[^\s<]+[^<.,:;"')\]\s]/g, // Discord's regex to match URLs
tag: "url",
},
// TODO: match emails, so they are not parsed as mentions (@gmail, @hotmail, etc)
// Emails matcher must be before @mentions matcher to avoid false positives
{
// Match emails
regex: /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/g,
tag: "email",
},
{
// match @mentions
regex: /@[\p{L}_0-9]+/giu,
regex: /(^|\s)@[\p{L}_0-9]+/giu,
tag: "mention",
transform: (match) => match.slice(1), // remove @
transform: (match) => match.trim().slice(1), // remove @
},
{
// match emojis 🔥
Expand Down
15 changes: 12 additions & 3 deletions tests/process/nlp/Tokenizer.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ describe("should match the correct tag", () => {
// urls
["http://example.com", "http://example.com", "url"],
// mentions
["[email protected]", "[email protected]", "email"],
["@mention", "mention", "mention"],
[" @mention", "mention", "mention"], // with whitespace
["@123123123", "123123123", "mention"],
// emojis
["🔥", "🔥", "emoji"],
Expand All @@ -61,12 +63,19 @@ describe("should match the correct tag", () => {

test.each(cases)("%p → %p (tag=%p)", async (input, expectedText, expectedTag) => {
const tokens = tokenize(input);
expect(tokens.length).toBe(1);
expect(tokens[0].text).toBe(expectedText);
expect(tokens[0].tag).toBe(expectedTag);
expect(tokens).toStrictEqual([{ text: expectedText, tag: expectedTag }]);
});
});

it("should not classify 'abc@xyz' as an email or mention", () => {
const tokens = tokenize("abc@xyz");
expect(tokens).toStrictEqual([
{ text: "abc", tag: "word" },
{ text: "@", tag: "unknown" },
{ text: "xyz", tag: "word" },
]);
});

it("exclude outside ' matching words", () => {
const tokens = tokenize("'hello'");
expect(tokens.length).toBe(3);
Expand Down
Loading