From 113ab0a28dcfc0929ac3bcdf61d23b1a5a6e739c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl-Gerhard=20Lindesva=CC=88rd?= Date: Wed, 2 Jul 2025 12:16:01 +0200 Subject: [PATCH 1/2] chore: update bots and referrers --- apps/api/src/bots/bots.ts | 520 ++++++++++++++++++----------- apps/worker/src/referrers/index.ts | 116 +++++-- package.json | 2 + 3 files changed, 416 insertions(+), 222 deletions(-) diff --git a/apps/api/src/bots/bots.ts b/apps/api/src/bots/bots.ts index 7b909690..c6933bdc 100644 --- a/apps/api/src/bots/bots.ts +++ b/apps/api/src/bots/bots.ts @@ -4,7 +4,7 @@ const bots = [ { - regex: 'WireReaderBot(?:/([\\d+.]+))?', + regex: 'WireReaderBot', name: 'WireReaderBot', category: 'Feed Fetcher', url: 'https://wirereader.app/', @@ -65,7 +65,7 @@ const bots = [ producer: { name: 'Ahrefs Pte Ltd', url: 'https://ahrefs.com/robot' }, }, { - regex: 'AhrefsSiteAudit/[\\d.]+', + regex: 'AhrefsSiteAudit', name: 'AhrefsSiteAudit', category: 'Site Monitor', url: 'https://ahrefs.com/robot/site-audit', @@ -86,14 +86,14 @@ const bots = [ producer: { name: 'Alexa Internet', url: 'https://www.alexa.com' }, }, { - regex: 'Amazonbot/[\\d.]+', + regex: 'Amazonbot', name: 'Amazon Bot', category: 'Crawler', url: 'https://developer.amazon.com/support/amazonbot', producer: { name: 'Amazon.com, Inc.', url: 'https://www.amazon.com/' }, }, { - regex: 'AmazonAdBot/[\\d.]+', + regex: 'AmazonAdBot', name: 'Amazon AdBot', category: 'Crawler', url: 'https://adbot.amazon.com/', @@ -576,7 +576,7 @@ const bots = [ producer: { name: 'Meta Platforms, Inc.', url: 'https://www.meta.com/' }, }, { - regex: 'FacebookBot/[\\d.]+', + regex: 'FacebookBot', name: 'FacebookBot', category: 'Crawler', url: 'https://developers.facebook.com/docs/sharing/bot', @@ -621,7 +621,7 @@ const bots = [ producer: { name: '', url: '' }, }, { - regex: 'Fever/[0-9]', + regex: 'Fever/', name: 'Fever', url: 'http://feedafever.com/', category: 'Feed Fetcher', @@ -863,7 +863,7 @@ const bots = [ url: 
'https://vuhuv.com/bot.html', }, { - regex: 'HTTPMon/[\\d.]+', + regex: 'HTTPMon', name: 'HTTPMon', category: 'Site Monitor', url: 'http://www.httpmon.com', @@ -905,7 +905,7 @@ const bots = [ url: '', producer: { name: '', url: 'https://ip-guide.com' }, }, - { regex: 'k6/[0-9.]+', name: 'K6', url: 'https://k6.io/' }, + { regex: 'k6/', name: 'K6', url: 'https://k6.io/' }, { regex: 'kouio', name: 'Kouio', @@ -980,7 +980,7 @@ const bots = [ producer: { name: '', url: '' }, }, { - regex: 'masscan-ng/[\\d.]+', + regex: 'masscan-ng', name: 'masscan-ng', url: 'https://github.com/bi-zone/masscan-ng', category: 'Crawler', @@ -1136,7 +1136,7 @@ const bots = [ url: 'https://nodeping.com', producer: { name: 'NodePing', url: 'https://nodeping.com' }, }, - { regex: 'Octopus [0-9]', name: 'Octopus' }, + { regex: 'Octopus [\\d.]+', name: 'Octopus' }, { regex: 'OnlineOrNot\\.com_bot', name: 'OnlineOrNot Bot', @@ -1206,7 +1206,7 @@ const bots = [ }, }, { - regex: 'Pocket(?:ImageCache|Parser)/[\\d.]+', + regex: 'Pocket(?:ImageCache|Parser)', name: 'Pocket', category: 'Read-it-later Service', url: 'https://getpocket.com/pocketparser_ua', @@ -1347,7 +1347,14 @@ const bots = [ producer: { name: 'Semrush Inc.', url: 'https://www.semrush.com/' }, }, { - regex: 'SerpReputationManagementAgent/[\\d.]+', + regex: 'BacklinksExtendedBot', + name: 'BacklinksExtendedBot', + category: 'Crawler', + url: 'https://www.semrush.com/bot/', + producer: { name: 'Semrush Inc.', url: 'https://www.semrush.com/' }, + }, + { + regex: 'SerpReputationManagementAgent', name: 'Semrush Reputation Management', category: 'Service Agent', url: 'https://www.semrush.com/bot/', @@ -1361,7 +1368,7 @@ const bots = [ producer: { name: 'Semrush Inc.', url: 'https://www.semrush.com/' }, }, { - regex: 'SiteAuditBot/[\\d.]+', + regex: 'SiteAuditBot', name: 'SiteAuditBot', category: 'Crawler', url: 'https://www.semrush.com/bot/', @@ -1381,13 +1388,6 @@ const bots = [ url: 'http://www.seoengine.com/seoengbot.htm', producer: { 
name: 'SEO Engine', url: 'http://www.seoengine.com' }, }, - { - regex: 'SEOkicks-Robot', - name: 'SEOkicks-Robot', - category: 'Crawler', - url: 'http://www.seokicks.de/robot.html', - producer: { name: 'SEOkicks', url: 'https://www.seokicks.de/' }, - }, { regex: 'seoscanners\\.net', name: 'Seoscanners.net', @@ -1495,13 +1495,6 @@ const bots = [ url: '', producer: { name: 'Sprinklr, Inc.', url: 'https://www.sprinklr.com/' }, }, - { - regex: 'sqlmap/', - name: 'sqlmap', - category: 'Security Checker', - url: 'http://sqlmap.org/', - producer: { name: 'sqlmap', url: 'http://sqlmap.org/' }, - }, { regex: 'SSL Labs', name: 'SSL Labs', @@ -1527,7 +1520,7 @@ const bots = [ producer: { name: 'Superfeedr', url: 'https://superfeedr.com/' }, }, { - regex: 'Sparkler/[0-9]', + regex: 'Sparkler', name: 'Sparkler', category: 'Crawler', url: 'https://github.com/USCDataScience/sparkler', @@ -1660,7 +1653,7 @@ const bots = [ producer: { name: 'UkrNet Ltd', url: 'https://www.ukr.net/' }, }, { - regex: 'Uptime(?:bot)?/[\\d.]+', + regex: 'Uptime(?:bot)?/', name: 'Uptimebot', category: 'Site Monitor', url: 'https://uptime.com/uptime-bot', @@ -1798,7 +1791,7 @@ const bots = [ producer: { name: 'WPBeginner, LLC', url: 'https://www.wpbeginner.com/' }, }, { - regex: 'Automattic Analytics Crawler/[\\d.]+', + regex: 'Automattic Analytics Crawler', name: 'Automattic Analytics', category: 'Crawler', url: 'https://wordpress.com/crawler/', @@ -1925,7 +1918,7 @@ const bots = [ producer: { name: 'NetEase, Inc.', url: 'http://corp.163.com' }, }, { - regex: 'YOURLS v[0-9]', + regex: 'YOURLS', name: 'Yourls', category: 'Crawler', url: 'http://yourls.org', @@ -1986,7 +1979,7 @@ const bots = [ producer: { name: 'HubPages, Inc.', url: 'https://discover.hubpages.com/' }, }, { - regex: 'Pinterest(?:bot)?/[\\d.]+.*www\\.pinterest\\.com', + regex: 'Pinterest(?:bot)?/.*www\\.pinterest\\.com', name: 'Pinterest', url: 'https://help.pinterest.com/en/business/article/pinterest-crawler', category: 'Crawler', @@ 
-2000,7 +1993,7 @@ const bots = [ producer: { name: 'Site24x7', url: 'https://www.site24x7.com' }, }, { - regex: '.* HLB/[\\d.]+', + regex: '.* HLB', name: 'Site24x7 Defacement Monitor', category: 'Site Monitor', url: 'https://support.site24x7.com/portal/en/kb/articles/default-user-agent-used-in-website-defacement-monitor', @@ -2021,7 +2014,7 @@ const bots = [ producer: { name: 'Snapchat Inc.', url: 'https://www.snapchat.com/' }, }, { - regex: 'SnapchatAds/[\\d.]+', + regex: 'SnapchatAds', name: 'Snapchat Ads', category: 'Crawler', url: 'https://businesshelp.snapchat.com/s/article/adsbot-crawler?language=en_US', @@ -2143,7 +2136,7 @@ const bots = [ producer: { name: 'WooRank sprl', url: 'https://www.woorank.com/' }, }, { - regex: 'by Siteimprove\\.com', + regex: 'Siteimprove', name: 'Siteimprove', category: 'Search bot', url: 'https://siteimprove.com/', @@ -2195,11 +2188,11 @@ const bots = [ producer: { name: 'Effyis Inc', url: 'https://boardreader.com/' }, }, { - regex: 'IDG/IT', - name: 'IDG/IT', - category: 'Search bot', - url: 'https://spaziodati.eu/', - producer: { name: 'SpazioDati S.r.l.', url: 'https://spaziodati.eu/' }, + regex: 'IDG/(?:EU|IT|RU|UK)', + name: 'IDG', + category: 'Crawler', + url: 'https://www.spaziodati.eu/', + producer: { name: 'SpazioDati S.r.l.', url: 'https://www.spaziodati.eu/' }, }, { regex: 'Bytespider', @@ -2386,7 +2379,7 @@ const bots = [ producer: { name: 'PPC Labs LLC', url: 'https://www.adbeat.com/' }, }, { - regex: '(?:BuiltWith|BW)/[\\d.]+', + regex: '(?:BuiltWith|BW/)', name: 'BuiltWith', category: 'Crawler', url: 'https://builtwith.com/biup', @@ -2445,7 +2438,7 @@ const bots = [ url: 'http://cloudsystemnetworks.com', }, { - regex: 'HeartRails_Capture/[\\d.]+', + regex: 'HeartRails_Capture', name: 'Heart Rails Capture', category: 'Service Agent', url: 'http://capture.heartrails.com', @@ -2458,7 +2451,7 @@ const bots = [ producer: { name: 'RedHunt Labs Limited', url: 'https://redhuntlabs.com/' }, }, { - regex: 
'DataXu/[\\d.]+', + regex: 'DataXu', name: 'DataXu', category: 'Service Agent', url: 'https://advertising.roku.com/dataxu', @@ -2584,13 +2577,13 @@ const bots = [ producer: { name: 'Hatena Co., Ltd.', url: 'https://www.hatena.ne.jp' }, }, { - regex: 'RyowlEngine/[\\d.]+', + regex: 'RyowlEngine', name: 'Ryowl', category: 'Crawler', url: 'https://ryowl.org', }, { - regex: 'OdklBot/[\\d.]+', + regex: 'OdklBot', name: 'Odnoklassniki Bot', category: 'Crawler', url: 'https://odnoklassniki.ru', @@ -2608,7 +2601,7 @@ const bots = [ url: 'https://www.zoominfo.com', }, { - regex: 'WeViKaBot/[\\d.]+', + regex: 'WeViKaBot', name: 'WeViKaBot', category: 'Crawler', url: 'http://www.wevika.de', @@ -2618,9 +2611,10 @@ const bots = [ name: 'SEOkicks', category: 'Crawler', url: 'https://www.seokicks.de/robot.html', + producer: { name: 'SEOkicks', url: 'https://www.seokicks.de/' }, }, { - regex: 'Plukkie/[\\d.]+', + regex: 'Plukkie', name: 'Plukkie', category: 'Crawler', url: 'http://www.botje.com/plukkie.htm', @@ -2632,25 +2626,25 @@ const bots = [ url: 'https://www.comscore.com/Web-Crawler', }, { - regex: 'SurdotlyBot/[\\d.]+', + regex: 'SurdotlyBot', name: 'SurdotlyBot', category: 'Crawler', url: 'http://sur.ly/bot.html', }, { - regex: 'Gowikibot/[\\d.]+', + regex: 'Gowikibot', name: 'Gowikibot', category: 'Crawler', url: 'http:/www.gowikibot.com', }, { - regex: 'SabsimBot/[\\d.]+', + regex: 'SabsimBot', name: 'SabsimBot', category: 'Crawler', url: 'https://sabsim.com', }, { - regex: 'LumtelBot/[\\d.]+', + regex: 'LumtelBot', name: 'LumtelBot', category: 'Crawler', url: 'https://umtel.com', @@ -2662,13 +2656,13 @@ const bots = [ url: 'http://www.pipl.com/bot', }, { - regex: 'woobot/[\\d.]+', + regex: 'woobot', name: 'WooRank', category: 'Crawler', url: 'https://www.woorank.com/bot', }, { - regex: 'Cookiebot/[\\d.]+', + regex: 'Cookiebot', name: 'Cookiebot', category: 'Crawler', url: 'https://support.cookiebot.com/hc/en-us/articles/360014264140-Scanner-User-Agent', @@ -2685,7 
+2679,7 @@ const bots = [ }, }, { - regex: 'CensysInspect/[\\d.]+', + regex: 'CensysInspect', name: 'CensysInspect', category: 'Security Checker', url: 'https://about.censys.io/', @@ -2702,26 +2696,26 @@ const bots = [ }, }, { - regex: 'WellKnownBot/[\\d.]+', + regex: 'WellKnownBot', name: 'WellKnownBot', category: 'Crawler', url: 'https://well-known.dev', }, { - regex: 'Adsbot/[\\d.]+', + regex: 'Adsbot', name: 'Adsbot', category: 'Crawler', url: 'https://seostar.co/robot/', }, { - regex: 'MTRobot/[\\d.]+', + regex: 'MTRobot', name: 'MTRobot', category: 'Crawler', url: 'https://metrics-tools.de/robot.html', producer: { name: 'Metrics Tools', url: 'https://metrics-tools.de/' }, }, { - regex: 'serpstatbot/[\\d.]+', + regex: 'serpstatbot', name: 'serpstatbot', category: 'Crawler', url: 'http://serpstatbot.com/', @@ -2734,26 +2728,26 @@ const bots = [ url: 'https://github.com/gocolly/colly/', }, { - regex: 'l9tcpid/v[\\d.]+', + regex: 'l9tcpid', name: 'l9tcpid', category: 'Security Checker', url: 'https://github.com/LeakIX/l9tcpid', }, { - regex: 'l9explore/[\\d.]+', + regex: 'l9explore', name: 'l9explore', category: 'Security Checker', url: 'https://github.com/LeakIX/l9explore', }, { - regex: 'l9scan/|^Lkx-.*/[\\d.]+', + regex: 'l9scan/|^Lkx-.*/', name: 'LeakIX', category: 'Security Checker', url: 'https://leakix.net/', producer: { name: 'BaDaaS SRL', url: 'https://leakix.net/' }, }, { - regex: 'MegaIndex\\.ru/[\\d.]+', + regex: 'MegaIndex\\.ru', name: 'MegaIndex', category: 'Crawler', url: 'https://megaindex.com/crawler', @@ -2766,60 +2760,67 @@ const bots = [ producer: { name: 'SISTRIX GmbH', url: 'https://www.sistrix.de/' }, }, { - regex: 'seolyt/[\\d.]+', - name: 'seolyt', + regex: 'Seolyt(?:Bot)?', + name: 'SeolytBot', category: 'Crawler', url: 'https://seolyt.com/', }, { - regex: 'YaK/[\\d.]+', + regex: 'YaK/', name: 'YaK', category: 'Crawler', url: 'https://www.linkfluence.com/', producer: { name: 'Linkfluence SAS', url: 'https://www.linkfluence.com/' }, }, { 
- regex: 'KomodiaBot/[\\d.]+', + regex: 'KomodiaBot', name: 'KomodiaBot', category: 'Crawler', url: 'http://www.komodia.com/newwiki/index.php/URL_server_crawler', producer: { name: 'Komodia Inc.', url: 'https://www.komodia.com/' }, }, { - regex: 'KStandBot/[\\d.]+', + regex: 'KStandBot', name: 'KStandBot', category: 'Crawler', url: 'https://url-classification.io/wiki/index.php?title=URL_server_crawler', producer: { name: 'Komodia Inc.', url: 'https://www.komodia.com/' }, }, { - regex: 'Neevabot/[\\d.]+', + regex: 'Neevabot', name: 'Neevabot', category: 'Search bot', url: 'https://neeva.com/neevabot', producer: { name: 'Neeva Inc.', url: 'https://neeva.com/' }, }, { - regex: 'LinkPreview/[\\d.]+', + regex: 'Chatwork LinkPreview', + name: 'Chatwork LinkPreview', + category: 'Service Agent', + url: 'https://go.chatwork.com/en/', + producer: { name: 'kubell Co., Ltd.', url: 'https://www.kubell.com/en/' }, + }, + { + regex: 'LinkPreview', name: 'LinkPreview', category: 'Service Agent', url: 'https://www.linkpreview.net/', }, { - regex: 'JungleKeyThumbnail/[\\d.]+', + regex: 'JungleKeyThumbnail', name: 'JungleKeyThumbnail', category: 'Crawler', url: 'https://junglekey.com/', }, { - regex: 'rocketmonitor(?: |bot/)[\\d.]+', + regex: 'rocketmonitor(?:bot)?', name: 'RocketMonitorBot', category: 'Site Monitor', url: 'https://www.radiomast.io/docs/stream-monitoring/technical_details.html', producer: { name: 'Radio Mast, Inc.', url: 'https://www.radiomast.io/' }, }, { - regex: 'SitemapParser-VIPnytt/[\\d.]+', + regex: 'SitemapParser-VIPnytt', name: 'SitemapParser-VIPnytt', category: 'Crawler', url: 'https://github.com/VIPnytt/SitemapParser/', @@ -2831,7 +2832,7 @@ const bots = [ url: 'https://turnitin.com/robot/crawlerinfo.html', }, { - regex: 'DMBrowser/[\\d.]+|DMBrowser-[UB]V', + regex: 'DMBrowser|DMBrowser-[UB]V', name: 'Dotcom Monitor', category: 'Site Monitor', url: 'https://www.dotcom-monitor.com', @@ -2844,19 +2845,19 @@ const bots = [ url: 
'https://dataforseo.com/dataforseo-bot', }, { - regex: 'Discordbot/[\\d.]+', + regex: 'Discordbot', name: 'Discord Bot', category: 'Service Agent', url: 'https://discordapp.com', }, { - regex: 'Linespider/[\\d.]+', + regex: 'Linespider', name: 'Linespider', category: 'Crawler', url: 'https://lin.ee/4dwXkTH', }, { - regex: 'Cincraw/[\\d.]+', + regex: 'Cincraw', name: 'Cincraw', category: 'Crawler', url: 'http://cincrawdata.net/bot/', @@ -2895,19 +2896,19 @@ const bots = [ }, }, { - regex: 'TigerBot/[\\d.]+', + regex: 'TigerBot', name: 'TigerBot', category: 'Crawler', url: 'https://tiger.ch/', }, { - regex: 'TestCrawler/[\\d.]+', + regex: 'TestCrawler', name: 'TestCrawler', category: 'Crawler', url: 'https://www.comcepta.com/', }, { - regex: 'CrowdTanglebot/[\\d.]+', + regex: 'CrowdTanglebot', name: 'CrowdTangle', category: 'Crawler', url: 'https://help.crowdtangle.com/en/articles/3009319-crowdtangle-bot', @@ -2941,14 +2942,14 @@ const bots = [ producer: { name: 'deepnoc, GmbH', url: 'https://deepnoc.com/' }, }, { - regex: 'Newslitbot/[\\d.]+', + regex: 'Newslitbot', name: 'Newslitbot', category: 'Crawler', url: 'https://www.newslit.co/', producer: { name: 'Newslit, LLC.', url: 'https://www.newslit.co/' }, }, { - regex: 'um-(?:ANS|CC|FC|IC|LN)/[\\d.]+', + regex: 'um-(?:ANS|CC|FC|IC|LN)', name: 'uMBot', category: 'Crawler', url: 'https://www.ubermetrics-technologies.com/', @@ -2958,7 +2959,7 @@ const bots = [ }, }, { - regex: 'Abonti/[\\d.]+', + regex: 'Abonti', name: 'Abonti', category: 'Crawler', url: 'http://abonti.com/', @@ -2981,7 +2982,7 @@ const bots = [ }, }, { - regex: 'ev-crawler/[\\d.]+', + regex: 'ev-crawler', name: 'Headline', category: 'Crawler', url: 'https://headline.com/legal/crawler', @@ -2991,7 +2992,7 @@ const bots = [ }, }, { - regex: 'webprosbot/[\\d.]+', + regex: 'webprosbot', name: 'WebPros', category: 'Crawler', url: 'https://webpros.com/', @@ -3005,7 +3006,7 @@ const bots = [ producer: { name: 'Amazon.com, Inc.', url: 
'https://www.amazon.com/' }, }, { - regex: 'Wheregoes\\.com Redirect Checker/[\\d.]+', + regex: 'Wheregoes\\.com Redirect Checker', name: 'WhereGoes', category: 'Crawler', url: 'https://wheregoes.com/', @@ -3017,13 +3018,13 @@ const bots = [ url: 'http://66.240.192.82/', }, { - regex: 'InternetMeasurement/[\\d.]+', + regex: 'InternetMeasurement', name: 'InternetMeasurement', category: 'Crawler', url: 'https://internet-measurement.com/', }, { - regex: 'DomainAppender /[\\d.]+', + regex: 'DomainAppender', name: 'DomainAppender', category: 'Crawler', url: 'https://www.profound.net/product/domain_append/', @@ -3033,7 +3034,7 @@ const bots = [ }, }, { - regex: 'FreeWebMonitoring SiteChecker/[\\d.]+', + regex: 'FreeWebMonitoring SiteChecker', name: 'FreeWebMonitoring', category: 'Site Monitor', url: 'https://www.freewebmonitoring.com/bot.html', @@ -3060,21 +3061,21 @@ const bots = [ producer: { name: 'Jaohawi AB', url: 'https://adstxtlab.com/' }, }, { - regex: 'Iframely/[\\d.]+', + regex: 'Iframely', name: 'Iframely', category: 'Crawler', url: 'https://iframely.com/', producer: { name: 'Itteco Software, Corp.', url: 'https://iframely.com/' }, }, { - regex: 'DomainStatsBot/[\\d.]+', + regex: 'DomainStatsBot', name: 'DomainStatsBot', category: 'Crawler', url: 'https://domainstats.com/pages/our-bot', producer: { name: 'Domainstats Ltd', url: 'https://domainstats.com/' }, }, { - regex: 'aiHitBot/[\\d.]+', + regex: 'aiHitBot', name: 'aiHitBot', category: 'Crawler', url: 'https://www.aihitdata.com/about', @@ -3088,7 +3089,7 @@ const bots = [ { regex: 'DNSResearchBot', name: 'DNSResearchBot', category: 'Crawler' }, { regex: 'GitCrawlerBot', name: 'GitCrawlerBot', category: 'Crawler' }, { - regex: 'AdAuth/[\\d.]+', + regex: 'AdAuth', name: 'AdAuth', category: 'Crawler', url: 'https://www.adauth.com', @@ -3166,7 +3167,7 @@ const bots = [ }, }, { - regex: 'ScamadviserExternalHit/[\\d.]+', + regex: 'ScamadviserExternalHit', name: 'Scamadviser External Hit', category: 'Crawler', 
url: 'https://www.scamadviser.com/', @@ -3183,26 +3184,20 @@ const bots = [ producer: { name: 'Zaldamo, LLC.', url: 'https://www.zaldamo.com/' }, }, { - regex: 'AFB/[\\d.]+', + regex: 'AFB', name: 'Allloadin Favicon Bot', category: 'Crawler', url: 'https://allloadin.com/', }, { - regex: 'SeolytBot/[\\d.]+', - name: 'Seolyt Bot', - category: 'Crawler', - url: 'https://seolyt.com', - }, - { - regex: 'LinkWalker/[\\d.]+', + regex: 'LinkWalker', name: 'LinkWalker', category: 'Crawler', url: 'https://www.phishlabs.com/', producer: { name: 'PhishLabs, Inc.', url: 'https://www.phishlabs.com/' }, }, { - regex: 'RenovateBot/[\\d.]+', + regex: 'RenovateBot', name: 'RenovateBot', category: 'Security Checker', url: 'https://github.com/renovatebot/renovate', @@ -3212,7 +3207,7 @@ const bots = [ }, }, { - regex: 'INETDEX-BOT/[\\d.]+', + regex: 'INETDEX-BOT', name: 'Inetdex Bot', category: 'Crawler', url: 'https://www.inetdex.com/', @@ -3242,28 +3237,28 @@ const bots = [ }, }, { - regex: 'Nicecrawler/[\\d.]+', + regex: 'Nicecrawler', name: 'NiceCrawler', category: 'Crawler', url: 'https://www.nicecrawler.com/', producer: { name: 'Intelium Corp.', url: 'https://www.intelium.com/' }, }, { - regex: 't3versionsBot/[\\d.]+', + regex: 't3versionsBot', name: 't3versions', category: 'Crawler', url: 'https://www.t3versions.com/bot', producer: { name: 'Torben Hansen', url: 'https://www.t3versions.com/' }, }, { - regex: 'Crawlson/[\\d.]+', + regex: 'Crawlson', name: 'Crawlson', category: 'Crawler', url: 'https://www.crawlson.com/about', producer: { name: 'Crawlson', url: 'https://www.crawlson.com/' }, }, { - regex: 'tchelebi/[\\d.]+', + regex: 'tchelebi', name: 'tchelebi', category: 'Crawler', url: 'https://tchelebi.io/', @@ -3277,7 +3272,7 @@ const bots = [ producer: { name: 'New Work SE', url: 'https://www.xing.com/' }, }, { - regex: 'RepoLookoutBot/v?[\\d.]+', + regex: 'RepoLookoutBot', name: 'Repo Lookout', category: 'Security Checker', url: 'https://www.repo-lookout.org/', @@ -3291,7 
+3286,7 @@ const bots = [ producer: { name: 'MAMI Project', url: 'https://mami-project.eu/' }, }, { - regex: 'everyfeed-spider/[\\d.]+', + regex: 'everyfeed-spider', name: 'Everyfeed', url: 'https://web.archive.org/web/20050930235914/http://www.everyfeed.com/', category: 'Feed Fetcher', @@ -3312,7 +3307,7 @@ const bots = [ producer: { name: '', url: '' }, }, { - regex: 'Gregarius/[\\d.]+', + regex: 'Gregarius', name: 'Gregarius', category: 'Feed Fetcher', url: 'https://web.archive.org/web/20100614011837/http://devlog.gregarius.net/docs/ua/', @@ -3336,22 +3331,21 @@ const bots = [ producer: { name: 'Sectigo Limited', url: 'https://sectigo.com/' }, }, { - regex: - 'KlarnaBot-(?:DownloadProductImage|EnrichProducts|PriceWatcher)/[\\d.]+', + regex: 'KlarnaBot-(?:DownloadProductImage|EnrichProducts|PriceWatcher)', name: 'KlarnaBot', category: 'Crawler', url: 'https://docs.klarna.com/klarna-bot/', producer: { name: 'Klarna Bank AB', url: 'https://www.klarna.com/' }, }, { - regex: 'Taboolabot/[\\d.]+', + regex: 'Taboolabot', name: 'Taboolabot', category: 'Crawler', url: 'https://help.taboola.com/hc/en-us/articles/115002347594-The-Taboola-Crawler', producer: { name: 'Taboola, Inc.', url: 'https://www.taboola.com/' }, }, { - regex: 'Asana/[\\d.]+', + regex: 'Asana', name: 'Asana', category: 'Crawler', url: 'https://asana.com/', @@ -3365,7 +3359,7 @@ const bots = [ producer: { name: 'Google Inc.', url: 'https://www.google.com/' }, }, { - regex: 'URLinspectorBot/[\\d.]+', + regex: 'URLinspectorBot', name: 'URLinspector', category: 'Site Monitor', url: 'https://www.urlinspector.com/bot/', @@ -3375,14 +3369,14 @@ const bots = [ }, }, { - regex: 'EntferBot/[\\d.]+', + regex: 'EntferBot', name: 'Entfer', category: 'Crawler', url: 'https://entfer.com/', producer: { name: 'Entfer Ltd.', url: 'https://entfer.com/' }, }, { - regex: 'TagInspector/[\\d.]+', + regex: 'TagInspector', name: 'Tag Inspector', category: 'Crawler', url: 'https://taginspector.com/', @@ -3406,46 +3400,46 @@ 
const bots = [ }, }, { - regex: 'DisqusAdstxtCrawler/[\\d.]+', + regex: 'DisqusAdstxtCrawler', name: 'Disqus', category: 'Crawler', url: 'https://help.disqus.com/en/articles/1765357-ads-txt-implementation-guide', producer: { name: 'Disqus, Inc.', url: 'https://disqus.com/' }, }, { - regex: 'startmebot/[\\d.]+', + regex: 'startmebot', name: 'start.me', category: 'Crawler', url: 'https://about.start.me/', producer: { name: 'start.me BV', url: 'https://about.start.me/' }, }, { - regex: '2ip bot/[\\d.]+', + regex: '2ip bot', name: '2ip', category: 'Crawler', url: 'https://2ip.io/', }, { - regex: 'ReqBin Curl Client/[\\d.]+', + regex: 'ReqBin Curl Client', name: 'ReqBin', category: 'Crawler', url: 'https://reqbin.com/curl', }, { - regex: 'XoviBot/[\\d.]+', + regex: 'XoviBot', name: 'XoviBot', category: 'Crawler', url: 'https://www.xovibot.net', producer: { name: 'Xovi GmbH', url: 'http://www.xovi.de' }, }, { - regex: 'Overcast/[\\d.]+ Podcast Sync', + regex: 'Overcast/.+Podcast Sync', name: 'Overcast Podcast Sync', category: 'Service Agent', url: 'https://overcast.fm/podcasterinfo', }, { - regex: '^Verity/[\\d.]+', + regex: '^Verity', name: 'GumGum Verity', category: 'Service Agent', url: 'https://gumgum.com/verity', @@ -3457,7 +3451,7 @@ const bots = [ url: 'https://github.com/snarfed/hackermention', }, { - regex: 'BitSightBot/[\\d.]+', + regex: 'BitSightBot', name: 'BitSight', category: 'Security Checker', url: 'https://www.bitsight.com/', @@ -3467,7 +3461,7 @@ const bots = [ }, }, { - regex: 'Ezgif/[\\d.]+', + regex: 'Ezgif', name: 'Ezgif', category: 'Service Agent', url: 'https://ezgif.com/about', @@ -3483,7 +3477,7 @@ const bots = [ }, }, { - regex: 'FemtosearchBot/[\\d.]+', + regex: 'FemtosearchBot', name: 'Femtosearch', category: 'Crawler', url: 'http://femtosearch.com/', @@ -3493,7 +3487,7 @@ const bots = [ }, }, { - regex: 'AdsTxtCrawler/[\\d.]+', + regex: 'AdsTxtCrawler/', name: 'AdsTxtCrawler', category: 'Crawler', url: 
'https://github.com/InteractiveAdvertisingBureau/adstxtcrawler', @@ -3510,7 +3504,7 @@ const bots = [ producer: { name: 'Morningscore', url: 'https://morningscore.io/' }, }, { - regex: 'Uptime-Kuma/[\\d.]+', + regex: 'Uptime-Kuma', name: 'Uptime-Kuma', category: 'Site Monitor', url: 'https://github.com/louislam/uptime-kuma', @@ -3523,7 +3517,7 @@ const bots = [ producer: { name: 'OpenAI OpCo, LLC', url: 'https://openai.com/' }, }, { - regex: 'GPTBot/[\\d.]+', + regex: 'GPTBot', name: 'GPTBot', category: 'Crawler', url: 'https://platform.openai.com/docs/bots', @@ -3537,7 +3531,7 @@ const bots = [ producer: { name: 'OpenAI OpCo, LLC', url: 'https://openai.com/' }, }, { - regex: 'BrightEdge Crawler/[\\d.]+', + regex: 'BrightEdge Crawler', name: 'BrightEdge', category: 'Crawler', url: 'https://www.brightedge.com/', @@ -3547,7 +3541,7 @@ const bots = [ }, }, { - regex: 'sfFeedReader/[\\d.]+', + regex: 'sfFeedReader', name: 'sfFeedReader', url: 'https://github.com/diem-project/sfFeed2Plugin', category: 'Feed Fetcher', @@ -3570,21 +3564,21 @@ const bots = [ }, }, { - regex: 'newspaper/[\\d.]+', + regex: 'newspaper', name: 'Scraping Robot', category: 'Crawler', url: 'https://scrapingrobot.com/', producer: { name: 'Sprious LLC', url: 'https://sprious.com/' }, }, { - regex: 'Ant(?:\\.com beta|Bot)(?:/([\\d+.]+))?', + regex: 'Ant(?:\\.com beta|Bot)', name: 'Ant', category: 'Crawler', url: 'https://www.ant.com/', producer: { name: 'Ant.com Ltd.', url: 'https://www.ant.com/' }, }, { - regex: 'WebwikiBot/[\\d.]+', + regex: 'WebwikiBot', name: 'Webwiki', category: 'Crawler', url: 'https://www.webwiki.com/', @@ -3604,7 +3598,7 @@ const bots = [ producer: { name: 'InnoCraft Ltd', url: 'https://matomo.org/' }, }, { - regex: 'Prometheus/[\\d.]+', + regex: 'Prometheus', name: 'Prometheus', category: 'Service Agent', url: 'https://github.com/prometheus/prometheus', @@ -3618,7 +3612,7 @@ const bots = [ producer: { name: 'ArchiveTeam', url: 'https://wiki.archiveteam.org/' }, }, { - 
regex: 'MADBbot/[\\d.]+', + regex: 'MADBbot', name: 'MADBbot', category: 'Crawler', url: 'https://madb.zapto.org/bot.html', @@ -3677,7 +3671,7 @@ const bots = [ producer: { name: 'Tactikast' }, }, { - regex: 'Brightbot ([\\d+.]+)', + regex: 'Brightbot', name: 'BrightBot', category: 'Crawler', url: 'https://www.brightbot.app/', @@ -3687,21 +3681,21 @@ const bots = [ }, }, { - regex: 'DaspeedBot/([\\d+.]+)', + regex: 'DaspeedBot', name: 'DaspeedBot', category: 'Crawler', url: 'https://daspeed.io/', producer: { name: 'DAWAP SARL', url: 'https://dawap.fr/' }, }, { - regex: 'StractBot(?:/([\\d+.]+))?', + regex: 'StractBot', name: 'Stract', category: 'Crawler', url: 'https://stract.com/webmasters', producer: { name: 'Stract', url: 'https://github.com/StractOrg/stract/' }, }, { - regex: 'GeedoBot(?:/([\\d+.]+))?', + regex: 'GeedoBot', name: 'GeedoBot', category: 'Crawler', url: 'https://geedo.com/bot/', @@ -3713,14 +3707,14 @@ const bots = [ url: 'https://geedo.com/product-search/', }, { - regex: 'BackupLand(?:/([\\d+.]+))?', + regex: 'BackupLand', name: 'BackupLand', category: 'Crawler', url: 'https://go.backupland.com/', producer: { name: 'ООО «КВАРТА»', url: 'https://go.backupland.com/' }, }, { - regex: 'Konturbot(?:/([\\d+.]+))?', + regex: 'Konturbot', name: 'Konturbot', category: 'Crawler', url: 'https://kontur.ru/', @@ -3734,19 +3728,19 @@ const bots = [ producer: { name: 'ООО «МОДЕСКО»', url: 'https://www.modesco.ru/' }, }, { - regex: 'LetsearchBot(?:/([\\d+.]+))?', + regex: 'LetsearchBot', name: 'LetSearch', category: 'Crawler', url: 'https://letsearch.ru/bots', }, { - regex: 'Example3(?:/([\\d+.]+))?', + regex: 'Example3', name: 'Example3', category: 'Crawler', url: 'https://www.example3.com/', }, { - regex: 'StatOnlineRuBot(?:/([\\d+.]+))?', + regex: 'StatOnlineRuBot', name: 'StatOnline.ru', category: 'Crawler', url: 'https://statonline.ru/', @@ -3829,14 +3823,14 @@ const bots = [ producer: { name: 'Anthropic, PBC', url: 'https://www.anthropic.com/' }, }, { - 
regex: 'NetpeakCheckerBot/[\\d.]+', + regex: 'NetpeakCheckerBot', name: 'Netpeak Checker', category: 'Crawler', url: 'https://netpeaksoftware.com/checker', producer: { name: 'Netpeak LTD', url: 'https://netpeaksoftware.com/' }, }, { - regex: 'SandobaCrawler/[\\d.]+', + regex: 'SandobaCrawler', name: 'Sandoba//Crawler', category: 'Crawler', url: 'https://www.sandoba.com/en/crawler/', @@ -3853,7 +3847,7 @@ const bots = [ producer: { name: 'Sirdata SAS', url: 'https://www.sirdata.com/' }, }, { - regex: 'CheckMarkNetwork/[\\d.]+', + regex: 'CheckMarkNetwork', name: 'CheckMark Network', category: 'Crawler', url: 'https://www.checkmarknetwork.com/spider.html/', @@ -3870,7 +3864,7 @@ const bots = [ producer: { name: 'Cohere, Inc.', url: 'https://cohere.com/' }, }, { - regex: 'PerplexityBot/[\\d.]+', + regex: 'PerplexityBot', name: 'PerplexityBot', category: 'Crawler', url: 'https://docs.perplexity.ai/docs/perplexitybot', @@ -3897,13 +3891,13 @@ const bots = [ producer: { name: 'Metadot, Corp.', url: 'https://www.metadot.com/' }, }, { - regex: 'Ruby, Twurly v[\\d.]+', + regex: 'Ruby, Twurly v', name: 'Twurly', category: 'Crawler', url: 'https://twurly.org/', }, { - regex: 'Mixnode(?:(?:Cache)?/[\\d.]+)?', + regex: 'Mixnode(?:Cache)?', name: 'Mixnode', category: 'Crawler', url: 'https://www.mixnode.com/', @@ -3912,9 +3906,9 @@ const bots = [ url: 'https://www.mixnode.com/', }, }, - { regex: 'CSSCheck/[\\d.]+', name: 'CSSCheck', category: 'Validator' }, + { regex: 'CSSCheck', name: 'CSSCheck', category: 'Validator' }, { - regex: 'MicrosoftPreview/[\\d.]+', + regex: 'MicrosoftPreview', name: 'Microsoft Preview', category: 'Service Agent', url: 'https://www.bing.com/webmasters/help/which-crawlers-does-bing-use-8c184ec0', @@ -3934,7 +3928,7 @@ const bots = [ }, }, { - regex: 'TinEye/[\\d.]+', + regex: 'TinEye', name: 'TinEye', category: 'Crawler', url: 'https://tineye.com/', @@ -3968,7 +3962,7 @@ const bots = [ }, }, { - regex: 'online-webceo-bot/[\\d.]+', + regex: 
'online-webceo-bot', name: 'WebCEO', category: 'Crawler', url: 'https://www.webceo.com/', @@ -3988,14 +3982,14 @@ const bots = [ producer: { name: 'Vistex LTD', url: 'https://www.htmlyse.com/' }, }, { - regex: 'TrendsmapResolver/[\\d.]+', + regex: 'TrendsmapResolver', name: 'Trendsmap', category: 'Crawler', url: 'https://www.trendsmap.com/', producer: { name: 'Trendsmap Pty Ltd', url: 'https://www.trendsmap.com/' }, }, { - regex: 'Shareaholic(?:bot)?/[\\d.]+', + regex: 'Shareaholic(?:bot)?', name: 'Steve Bot', category: 'Crawler', url: 'https://www.shareaholic.com/steve', @@ -4031,19 +4025,12 @@ const bots = [ url: 'https://www.whatsmyip.org/ua/', }, { - regex: 'SenutoBot/[\\d.]+', + regex: 'SenutoBot', name: 'Senuto', category: 'Crawler', url: 'https://www.senuto.com/', producer: { name: 'Senuto Sp. z o.o.', url: 'https://www.senuto.com/' }, }, - { - regex: 'spaziodati', - name: 'SpazioDati', - category: 'Crawler', - url: 'https://www.spaziodati.eu/', - producer: { name: 'SpazioDati s.r.l.', url: 'https://www.spaziodati.eu/' }, - }, { regex: 'GozleBot', name: 'Gozle', @@ -4052,7 +4039,7 @@ const bots = [ producer: { name: 'Doly Horjun HJ', url: 'https://gozle.com.tm/' }, }, { - regex: 'Quantcastbot/[\\d.]+', + regex: 'Quantcastbot', name: 'Quantcast', category: 'Crawler', url: 'https://www.quantcast.com/bot/', @@ -4090,14 +4077,14 @@ const bots = [ }, }, { - regex: 'RuxitSynthetic/[\\d.]+', + regex: 'RuxitSynthetic', name: 'RuxitSynthetic', category: 'Site Monitor', url: 'https://community.dynatrace.com/t5/Troubleshooting/Basic-Commands-for-Synthetic/ta-p/198164', producer: { name: 'Dynatrace LLC', url: 'https://www.dynatrace.com/' }, }, { - regex: 'DynatraceSynthetic/[\\d.]+', + regex: 'DynatraceSynthetic', name: 'DynatraceSynthetic', category: 'Site Monitor', url: 'https://community.dynatrace.com/t5/Troubleshooting/Basic-Commands-for-Synthetic/ta-p/198164', @@ -4111,7 +4098,7 @@ const bots = [ producer: { name: 'Sitebulb Limited', url: 'https://sitebulb.com/' }, 
}, { - regex: 'Monsidobot/[\\d.]+', + regex: 'Monsidobot', name: 'Monsidobot', category: 'Crawler', url: 'https://monsido.com/bot-html', @@ -4141,7 +4128,7 @@ const bots = [ url: 'https://www.google.com/script/start/', }, { - regex: 'SiteOne-Crawler/[\\d.]+', + regex: 'SiteOne-Crawler', name: 'SiteOne Crawler', category: 'Crawler', url: 'https://crawler.siteone.io/bot/', @@ -4165,7 +4152,7 @@ const bots = [ }, }, { - regex: 'Paqlebot/[\\d.]+', + regex: 'Paqlebot', name: 'Paqlebot', category: 'Crawler', url: 'https://www.paqle.dk/about/paqlebot', @@ -4184,7 +4171,7 @@ const bots = [ url: 'https://github.com/matrix-org/synapse', }, { - regex: 'OSZKbot/[\\d.]+', + regex: 'OSZKbot', name: 'OSZKbot', category: 'Crawler', url: 'http://mekosztaly.oszk.hu/mia/', @@ -4201,7 +4188,7 @@ const bots = [ producer: { name: 'SEO Cube S.r.l.', url: 'https://www.seocube.it/' }, }, { - regex: 'RavenCrawler/[\\d.]+', + regex: 'RavenCrawler', name: 'RavenCrawler', category: 'Crawler', url: 'https://raventools.com/site-auditor/', @@ -4215,14 +4202,14 @@ const bots = [ producer: { name: 'Kadolijst', url: 'https://www.kadolijst.nl/' }, }, { - regex: 'Dubbotbot/[\\d.]+', + regex: 'Dubbotbot', name: 'Dubbotbot', category: 'Crawler', url: 'https://help.dubbot.com/en/articles/6746594-example-custom-user-agent', producer: { name: 'DubBot', url: 'https://dubbot.com/' }, }, { - regex: 'Swiftbot/[\\d.]+', + regex: 'Swiftbot', name: 'Swiftbot', category: 'Crawler', url: 'https://swiftype.com/swiftbot', @@ -4242,13 +4229,9 @@ const bots = [ url: 'https://www.thousandeyes.com/', producer: { name: 'Cisco Systems, Inc.', url: 'https://www.cisco.com/' }, }, - { regex: 'OmtrBot/[\\d.]+', name: 'OmtrBot', category: 'Site Monitor' }, - { regex: 'WebMon/[\\d.]+', name: 'WebMon', category: 'Site Monitor' }, - { - regex: 'AdsTxtCrawlerTP/[\\d.]+', - name: 'AdsTxtCrawlerTP', - category: 'Crawler', - }, + { regex: 'OmtrBot', name: 'OmtrBot', category: 'Site Monitor' }, + { regex: 'WebMon', name: 'WebMon', 
category: 'Site Monitor' }, + { regex: 'AdsTxtCrawlerTP', name: 'AdsTxtCrawlerTP', category: 'Crawler' }, { regex: 'fragFINN', name: 'fragFINN', @@ -4264,7 +4247,7 @@ const bots = [ producer: { name: 'Clickagy, LLC', url: 'https://www.clickagy.com/' }, }, { - regex: 'kiwitcms-gitops/[\\d.]+', + regex: 'kiwitcms-gitops', name: 'Kiwi TCMS GitOps', category: 'Service Agent', url: 'https://kiwitcms.org', @@ -4333,14 +4316,14 @@ const bots = [ }, }, { - regex: 'Uptimia(?:/[\\d.]+)?', + regex: 'Uptimia', name: 'Uptimia', category: 'Site Monitor', url: 'https://www.uptimia.com/', producer: { name: 'JJ Online GmbH', url: 'https://www.uptimia.com/' }, }, { - regex: '2GDPR/[\\d.]+', + regex: '2GDPR', name: '2GDPR', category: 'Service Agent', url: 'https://2gdpr.com/tos', @@ -4361,7 +4344,7 @@ const bots = [ producer: { name: 'CheckHost', url: 'https://check-host.net/' }, }, { - regex: 'LAC_IAHarvester/[\\d.]+', + regex: 'LAC_IAHarvester', name: 'LAC IA Harvester', category: 'Crawler', url: 'https://library-archives.canada.ca/eng/services/government-canada/web-social-media-preservation-program/Pages/web-archive.aspx', @@ -4371,7 +4354,7 @@ const bots = [ }, }, { - regex: 'InsytfulBot/[\\d.]+', + regex: 'InsytfulBot', name: 'InsytfulBot', category: 'Crawler', url: 'https://www.insytful.com/', @@ -4385,7 +4368,7 @@ const bots = [ producer: { name: 'Statista, Inc.', url: 'https://www.statista.com/' }, }, { - regex: 'SubstackContentFetch/[\\d.]+', + regex: 'SubstackContentFetch', name: 'Substack Content Fetch', category: 'Crawler', url: 'https://substack.com/', @@ -4416,7 +4399,7 @@ const bots = [ producer: { name: 'Tenable, Inc.', url: 'https://www.tenable.com/' }, }, { - regex: 'Castopod/[\\d.]+', + regex: 'Castopod', name: 'Castopod', category: 'Crawler', url: 'https://www.castopod.org/', @@ -4429,7 +4412,7 @@ const bots = [ producer: { name: 'Elasticsearch B.V.', url: 'https://www.elastic.co/' }, }, { - regex: 'WDG_Validator/[\\d.]+', + regex: 'WDG_Validator', name: 'WDG HTML 
Validator', category: 'Validator', url: 'http://www.htmlhelp.com/tools/validator/', @@ -4441,7 +4424,7 @@ const bots = [ url: 'https://web.archive.org/web/20180910002802/http://www.aegis.network/', }, { - regex: 'CrawlyProjectCrawler/[\\d.]+', + regex: 'CrawlyProjectCrawler', name: 'Crawly Project', category: 'Crawler', url: 'https://web.archive.org/web/20240326141952/https://crawlyproject.digitaldragon.dev/', @@ -4459,20 +4442,20 @@ const bots = [ url: 'https://github.com/openeasm/punkmap', }, { - regex: 'GenomeCrawlerd/[\\d.]+', + regex: 'GenomeCrawlerd', name: 'Deepfield Genome', category: 'Crawler', url: 'https://www.nokia.com/networks/ip-networks/deepfield/genome/', producer: { name: 'Nokia Corporation', url: 'https://www.nokia.com/' }, }, { - regex: 'Gaisbot/[\\d.]+', + regex: 'Gaisbot', name: 'Gaisbot', category: 'Crawler', url: 'https://web.archive.org/web/20090604121511/https://gais.cs.ccu.edu.tw/robot.php', }, { - regex: 'FAST-WebCrawler/[\\d.]+', + regex: 'FAST-WebCrawler', name: 'AlltheWeb', category: 'Crawler', url: 'https://web.archive.org/web/20041020050801/http://www.alltheweb.com/help/webmaster/crawler', @@ -4484,7 +4467,7 @@ const bots = [ url: 'https://ducks.party/', }, { - regex: 'DepSpid/[\\d.]+', + regex: 'DepSpid', name: 'DepSpid', category: 'Crawler', url: 'https://web.archive.org/web/20080321224033/http://about.depspid.net/', @@ -4589,7 +4572,7 @@ const bots = [ producer: { name: 'Marginalia', url: 'https://www.marginalia.nu/' }, }, { - regex: 'vu-server-health-scanner/[\\d.]+', + regex: 'vu-server-health-scanner', name: 'VU Server Health Scanner', category: 'Security Checker', url: 'https://130.37.198.75/index.html', @@ -4749,9 +4732,150 @@ const bots = [ category: 'Crawler', url: 'https://github.com/nettrom/suggestbot', }, + { + regex: 'cms-experiment', + name: 'CMS Experiment', + category: 'Security Checker', + url: 'https://securitee.org/cms-experiment-fall2024/', + }, + { + regex: 'SiteCheckerBotCrawler', + name: 
'SiteCheckerBotCrawler', + category: 'Crawler', + url: 'https://sitechecker.pro/', + producer: { name: 'Cyber Circus Limited', url: 'https://sitechecker.pro/' }, + }, + { + regex: 'SBIder', + name: 'SBIder', + category: 'Crawler', + url: 'https://www.sitesell.com/sbider.html', + producer: { name: 'SiteSell Inc.', url: 'https://www.sitesell.com/' }, + }, + { + regex: 'LightspeedSystemsCrawler', + name: 'LightspeedSystemsCrawler', + category: 'Crawler', + url: 'https://www.lightspeedsystems.com/', + producer: { + name: 'Lightspeed Systems, Inc.', + url: 'https://www.lightspeedsystems.com/', + }, + }, + { + regex: 'Research JLU', + name: 'Research JLU', + category: 'Crawler', + url: 'https://www.uni-giessen.de/en/research', + producer: { + name: 'Justus Liebig University Giessen', + url: 'https://www.uni-giessen.de/en', + }, + }, + { + regex: '(?:hgf|OS)AlphaXCrawl', + name: 'AlphaXCrawl', + category: 'Crawler', + url: 'https://www.fim.uni-passau.de/en/data-science/research/open-search', + producer: { + name: 'University of Passau', + url: 'https://www.uni-passau.de/en/', + }, + }, + { + regex: 'WPMU DEV', + name: 'WPMU DEV', + category: 'Crawler', + url: 'https://wpmudev.com/docs/wpmu-dev-plugins/broken-link-checker/#broken-link-checker-user-agent', + producer: { name: 'Incsub, LLC.', url: 'https://incsub.com/' }, + }, + { + regex: 'SnoopSecInspect', + name: 'SnoopSecInspect', + category: 'Security Checker', + url: 'https://web.archive.org/web/20241206193253/https://snoopsec.us.to/', + }, + { + regex: 'ModatScanner', + name: 'ModatScanner', + category: 'Security Checker', + url: 'https://www.modat.io/scanning', + producer: { name: 'Modat B.V.', url: 'https://www.modat.io/' }, + }, + { + regex: 'researchcyber\\.net', + name: 'researchcyber.net', + category: 'Security Checker', + url: 'https://web.archive.org/web/20241219082407/https://researchcyber.net/', + }, + { + regex: 'CrystalSemanticsBot', + name: 'CrystalSemanticsBot', + category: 'Crawler', + url: 
'https://web.archive.org/web/20121230203310/http://www.crystalsemantics.com/user-agent/', + producer: { + name: 'Crystal Semantics Ltd.', + url: 'https://web.archive.org/web/20121029062239/http://www.crystalsemantics.com/', + }, + }, + { + regex: 'najdu\\.s\\.holubem\\.eu', + name: 'najdu.s.holubem.eu', + category: 'Crawler', + url: 'https://najdu.s.holubem.eu/', + }, + { + regex: 'VORTEX/', + name: 'VORTEX', + category: 'Crawler', + url: 'https://marty.anstey.ca/robots/vortex', + }, + { + regex: 'xtate/(\\d+\\.[.\\d]+)', + name: 'xtate', + category: 'Crawler', + url: 'https://github.com/babycoff/xtate', + }, + { + regex: 'FediList Agent/', + name: 'FediList', + category: 'Social Media Agent', + url: 'https://fedilist.com/', + }, + { + regex: 'Grafana/(\\d+\\.[.\\d]+)', + name: 'Grafana', + category: 'Site Monitor', + url: 'https://github.com/grafana/grafana', + producer: { name: 'Grafana Labs', url: 'https://grafana.com/' }, + }, + { + regex: 'github-camo', + name: 'Github Camo', + category: 'Crawler', + url: 'https://github.com/atmos/camo', + producer: { + name: 'Github', + url: 'https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/about-anonymized-urls', + }, + }, + { + regex: 'Bluesky', + name: 'Bluesky', + category: 'Social Media Agent', + url: 'https://bsky.app', + producer: { name: 'Bluesky Social PBC', url: 'https://bsky.app' }, + }, + { + regex: 'OpenGraph\\.io', + name: 'OpenGraph.io', + category: 'Crawler', + url: 'https://www.opengraph.io', + producer: { name: 'OpenGraph.io', url: 'https://www.opengraph.io' }, + }, { regex: - 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|vortex(?!(?: Build|Plus| CM62| HD65))|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\\.o\\.t9|InterNaetBoten|EasyBib 
AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \\(cow\\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|Wordup|Keydrop|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node.js|Report Runner|url|Zeus|ZmEu)$', + 'nuhk|grub-client|Download Demon|SearchExpress|Microsoft URL Control|borg|altavista|dataminr\\.com|teoma|oegp|http%20client|htdig|mogimogi|larbin|scrubby|searchsight|semanticdiscovery|snappy|zeal(?!ot)|dataparksearch|findlinks|BrowserMob|URL2PNG|ZooShot|GomezA|Google SketchUp|Read%20Later|7Siters|centuryb\\.o\\.t9|InterNaetBoten|EasyBib AutoCite|Bidtellect|tomnomnom/meg|cortex|Re-re Studio|adreview|AHC/|NameOfAgent|Request-Promise|ALittle Client|Hello,? 
world|wp_is_mobile|0xAbyssalDoesntExist|Anarchy99|^revolt|nvd0rz|xfa1|Hakai|gbrmss|fuck-your-hp|IDBTE4M CODE87|Antoine|Insomania|Hells-Net|b3astmode|Linux Gnu \\(cow\\)|Test Certificate Info|iplabel|Magellan|TheSafex?Internetx?Search|Searcherx?web|kirkland-signature|LinkChain|survey-security-dot-txt|infrawatch|Time/|r00ts3c-owned-you|nvdorz|Root Slut|NiggaBalls|BotPoke|GlobalWebSearch|xx032_bo9vs83_2a|sslshed|geckotrail|Wordup|Keydrop|\\(compatible\\)|John Recon|SPARK COMMIT|masjesu|Komaru_The_Cat|Jesus Christ of Nazareth is LORD|Kowai|Hakai|LoliSec|LMAO|^xenu|^(?:chrome|firefox|Abcd|Dark|KvshClient|Node.js|Report Runner|url|Zeus|ZmEu)$|OnlyScans|TheInternetSearchx', name: 'Generic Bot', }, { diff --git a/apps/worker/src/referrers/index.ts b/apps/worker/src/referrers/index.ts index 6d1734d3..bb1de3d1 100644 --- a/apps/worker/src/referrers/index.ts +++ b/apps/worker/src/referrers/index.ts @@ -55,7 +55,7 @@ const referrers: Record = { 'lo.st': { type: 'search', name: 'Lo.st' }, 'www1.dastelefonbuch.de': { type: 'search', name: 'DasTelefonbuch' }, 'www.fireball.de': { type: 'search', name: 'Fireball' }, - 'search.1und1.de': { type: 'search', name: '1und1' }, + 'suche.1und1.de': { type: 'search', name: '1und1' }, 'ricerca.virgilio.it': { type: 'search', name: 'Virgilio' }, 'ricercaimmagini.virgilio.it': { type: 'search', name: 'Virgilio' }, 'ricercavideo.virgilio.it': { type: 'search', name: 'Virgilio' }, @@ -1162,6 +1162,7 @@ const referrers: Record = { 'clusty.com': { type: 'search', name: 'InfoSpace' }, 'www.weborama.com': { type: 'search', name: 'Weborama' }, 'search.bluewin.ch': { type: 'search', name: 'Bluewin' }, + 'search.brave.com': { type: 'search', name: 'Brave' }, 'search.bt.com': { type: 'search', name: 'British Telecommunications' }, 'www.neti.ee': { type: 'search', name: 'Neti' }, 'nigma.ru': { type: 'search', name: 'Nigma' }, @@ -1206,6 +1207,8 @@ const referrers: Record = { 'www.toile.com': { type: 'search', name: 'La Toile Du Quebec Via Google' }, 
'web.toile.com': { type: 'search', name: 'La Toile Du Quebec Via Google' }, 'www.paperball.de': { type: 'search', name: 'Paperball' }, + 'arianna.libero.it': { type: 'search', name: 'Arianna' }, + 'www.arianna.com': { type: 'search', name: 'Arianna' }, 'www.stepstone.de': { type: 'search', name: 'StepStone' }, 'www.stepstone.at': { type: 'search', name: 'StepStone' }, 'www.stepstone.be': { type: 'search', name: 'StepStone' }, @@ -1496,7 +1499,7 @@ const referrers: Record = { 'www.walhello.com': { type: 'search', name: 'Walhello' }, 'www.walhello.de': { type: 'search', name: 'Walhello' }, 'www.walhello.nl': { type: 'search', name: 'Walhello' }, - 'meta.ua': { type: 'search', name: 'Meta' }, + 'www.startsiden.no': { type: 'search', name: 'Startsiden' }, 'www.skynet.be': { type: 'search', name: 'Skynet' }, 'www.searchy.co.uk': { type: 'search', name: 'Searchy' }, 'search.findwide.com': { type: 'search', name: 'Findwide' }, @@ -1518,7 +1521,6 @@ const referrers: Record = { 'search.lilo.org': { type: 'search', name: 'Lilo' }, 'search.naver.com': { type: 'search', name: 'Naver' }, 'www.zoeken.nl': { type: 'search', name: 'Zoeken' }, - 'www.startsiden.no': { type: 'search', name: 'Startsiden' }, 'search.yam.com': { type: 'search', name: 'Yam' }, 'www.eniro.se': { type: 'search', name: 'Eniro' }, 'apollo7.de': { type: 'search', name: 'APOLL07' }, @@ -1569,6 +1571,7 @@ const referrers: Record = { 'suche.gmx.net': { type: 'search', name: 'GMX' }, 'daemon-search.com': { type: 'search', name: 'Daemon search' }, 'my.daemon-search.com': { type: 'search', name: 'Daemon search' }, + 'meta.ua': { type: 'search', name: 'Meta.ua' }, 'so.m.sm.cn': { type: 'search', name: 'Shenma' }, 'yz.m.sm.cn': { type: 'search', name: 'Shenma' }, 'm.sm.cn': { type: 'search', name: 'Shenma' }, @@ -1938,8 +1941,7 @@ const referrers: Record = { 'jyxo.1188.cz': { type: 'search', name: 'Jyxo' }, 'www.kataweb.it': { type: 'search', name: 'Kataweb' }, 'busca.uol.com.br': { type: 'search', name: 
'uol.com.br' }, - 'arianna.libero.it': { type: 'search', name: 'Arianna' }, - 'www.arianna.com': { type: 'search', name: 'Arianna' }, + 'websearch.rakuten.co.jp': { type: 'search', name: 'Rakuten' }, 'www.mamma.com': { type: 'search', name: 'Mamma' }, 'mamma75.mamma.com': { type: 'search', name: 'Mamma' }, 'www.yatedo.com': { type: 'search', name: 'Yatedo' }, @@ -1947,7 +1949,6 @@ const referrers: Record = { 'www.twingly.com': { type: 'search', name: 'Twingly' }, 'smart.delfi.lv': { type: 'search', name: 'Delfi latvia' }, 'www.pricerunner.co.uk': { type: 'search', name: 'PriceRunner' }, - 'websearch.rakuten.co.jp': { type: 'search', name: 'Rakuten' }, 'www.google.com': { type: 'search', name: 'Google' }, 'www.google.ac': { type: 'search', name: 'Google' }, 'www.google.ad': { type: 'search', name: 'Google' }, @@ -2395,8 +2396,10 @@ const referrers: Record = { 'email.telstra.com': { type: 'email', name: 'Bigpond' }, 'basic.messaging.bigpond.com': { type: 'email', name: 'Bigpond' }, 'mail.naver.com': { type: 'email', name: 'Naver Mail' }, + 'email.t-online.de': { type: 'email', name: 'T-online' }, 'mail.zoho.com': { type: 'email', name: 'Zoho' }, - 'webmail.virginbroadband.com.au': { type: 'email', name: 'Virgin' }, + 'mail.163.com': { type: 'email', name: '163 Mail' }, + 'webmail.tim.it': { type: 'email', name: 'TIM' }, 'mail.yahoo.net': { type: 'email', name: 'Yahoo! Mail' }, 'mail.yahoo.com': { type: 'email', name: 'Yahoo! Mail' }, 'mail.yahoo.co.uk': { type: 'email', name: 'Yahoo! 
Mail' }, @@ -2409,10 +2412,18 @@ const referrers: Record = { 'mail.iinet.net.au': { type: 'email', name: 'iiNet' }, 'mail.e1.ru': { type: 'email', name: 'E1.ru' }, 'webmail.vodafone.co.nz': { type: 'email', name: 'Vodafone' }, + 'mail.vodafone.de': { type: 'email', name: 'Vodafone' }, + 'deref-1und1-02.de': { type: 'email', name: '1und1' }, + 'webmail.dodo.com.au': { type: 'email', name: 'Dodo' }, 'mail.126.com': { type: 'email', name: '126 Mail' }, 'com.mailchimp.mailchimp': { type: 'email', name: 'Mailchimp' }, 'inbox.com': { type: 'email', name: 'Inbox.com' }, 'webmail.iprimus.com.au': { type: 'email', name: 'iPrimus' }, + 'deref-web.de': { type: 'email', name: 'Web.de' }, + '3c.web.de': { type: 'email', name: 'Web.de' }, + '3c-bap.web.de': { type: 'email', name: 'Web.de' }, + 'lightmailer-bap.web.de': { type: 'email', name: 'Web.de' }, + 'lightmailer-bs.web.de': { type: 'email', name: 'Web.de' }, 'mail.qq.com': { type: 'email', name: 'QQ Mail' }, 'exmail.qq.com': { type: 'email', name: 'QQ Mail' }, 'mail.qip.ru': { type: 'email', name: 'QIP' }, @@ -2423,30 +2434,79 @@ const referrers: Record = { 'mail.live.com': { type: 'email', name: 'Outlook.com' }, 'outlook.live.com': { type: 'email', name: 'Outlook.com' }, 'com.microsoft.office.outlook': { type: 'email', name: 'Outlook.com' }, - 'webmail.dodo.com.au': { type: 'email', name: 'Dodo' }, - 'webmail.2degreesbroadband.co.nz': { type: 'email', name: '2degrees' }, + 'deref-mail.com': { type: 'email', name: 'Mail.com' }, + '3c-lxa.mail.com': { type: 'email', name: 'Mail.com' }, + 'lightmailer.mail.com': { type: 'email', name: 'Mail.com' }, 'mail2.daum.net': { type: 'email', name: 'Daum Mail' }, 'mail.daum.net': { type: 'email', name: 'Daum Mail' }, + 'upcmail.hispeed.ch': { type: 'email', name: 'UPC' }, + 'webmail.2degreesbroadband.co.nz': { type: 'email', name: '2degrees' }, 'post.ru': { type: 'email', name: 'Beeline' }, - 'e.mail.ru': { type: 'email', name: 'Mail.ru' }, - 'touch.mail.ru': { type: 'email', name: 
'Mail.ru' }, + 'mail.infomaniak.com': { type: 'email', name: 'Infomaniak' }, 'webmail.adam.com.au': { type: 'email', name: 'Adam Internet' }, 'orange.fr/webmail': { type: 'email', name: 'Orange Webmail' }, + 'mail01.orange.fr': { type: 'email', name: 'Orange Webmail' }, + 'mail02.orange.fr': { type: 'email', name: 'Orange Webmail' }, + 'wmail.orange.fr': { type: 'email', name: 'Orange Webmail' }, + 'messageriepro3.orange.fr': { type: 'email', name: 'Orange Webmail' }, + 'messagerie.orange.fr': { type: 'email', name: 'Orange Webmail' }, + 'email.ionos.de': { type: 'email', name: 'Ionos' }, + 'email.ionos.es': { type: 'email', name: 'Ionos' }, + 'email.ionos.fr': { type: 'email', name: 'Ionos' }, + 'email.ionos.it': { type: 'email', name: 'Ionos' }, + 'email.ionos.ca': { type: 'email', name: 'Ionos' }, + 'email.ionos.mx': { type: 'email', name: 'Ionos' }, + 'email.ionos.com': { type: 'email', name: 'Ionos' }, + 'email.ionos.co.uk': { type: 'email', name: 'Ionos' }, + 'mailbusiness.ionos.de': { type: 'email', name: 'Ionos' }, + 'mailbusiness.ionos.es': { type: 'email', name: 'Ionos' }, + 'mailbusiness.ionos.fr': { type: 'email', name: 'Ionos' }, + 'mailbusiness.ionos.it': { type: 'email', name: 'Ionos' }, + 'mailbusiness.ionos.ca': { type: 'email', name: 'Ionos' }, + 'mailbusiness.ionos.mx': { type: 'email', name: 'Ionos' }, + 'mailbusiness.ionos.com': { type: 'email', name: 'Ionos' }, + 'mailbusiness.ionos.co.uk': { type: 'email', name: 'Ionos' }, 'com.earthlink.myearthlink': { type: 'email', name: 'earthlink' }, + 'rich-v01.bluewin.ch': { type: 'email', name: 'Bluewin' }, + 'rich-v02.bluewin.ch': { type: 'email', name: 'Bluewin' }, + 'email.bluewin.ch': { type: 'email', name: 'Bluewin' }, 'mail.aol.com': { type: 'email', name: 'AOL Mail' }, 'com.aol.mobile.aolapp': { type: 'email', name: 'AOL Mail' }, 'webmail.netspace.net.au': { type: 'email', name: 'Netspace' }, 'webmail.optuszoo.com.au': { type: 'email', name: 'Optus Zoo' }, 'webmail.optusnet.com.au': { type: 
'email', name: 'Optus Zoo' }, + 'webmail.virginbroadband.com.au': { type: 'email', name: 'Virgin' }, + 'mail.proton.me': { type: 'email', name: 'Proton' }, 'webmail.commander.net.au': { type: 'email', name: 'Commander' }, 'mastermail.ru': { type: 'email', name: 'Mastermail' }, 'm.mastermail.ru': { type: 'email', name: 'Mastermail' }, + 'deref-gmx.de': { type: 'email', name: 'GMX' }, + 'deref-gmx.at': { type: 'email', name: 'GMX' }, + 'deref-gmx.ch': { type: 'email', name: 'GMX' }, + 'deref-gmx.fr': { type: 'email', name: 'GMX' }, + 'deref-gmx.es': { type: 'email', name: 'GMX' }, + 'deref-gmx.it': { type: 'email', name: 'GMX' }, + 'deref-gmx.com': { type: 'email', name: 'GMX' }, + 'deref-gmx.net': { type: 'email', name: 'GMX' }, + 'deref-gmx.co.uk': { type: 'email', name: 'GMX' }, + 'lightmailer.gmx.de': { type: 'email', name: 'GMX' }, + 'lightmailer.gmx.at': { type: 'email', name: 'GMX' }, + 'lightmailer.gmx.ch': { type: 'email', name: 'GMX' }, + 'lightmailer.gmx.fr': { type: 'email', name: 'GMX' }, + 'lightmailer.gmx.es': { type: 'email', name: 'GMX' }, + 'lightmailer.gmx.it': { type: 'email', name: 'GMX' }, + 'lightmailer.gmx.com': { type: 'email', name: 'GMX' }, + 'lightmailer.gmx.net': { type: 'email', name: 'GMX' }, + 'lightmailer.gmx.co.uk': { type: 'email', name: 'GMX' }, + 'lightmailer-bs.gmx.net': { type: 'email', name: 'GMX' }, + 'lightmailer-bap.gmx.net': { type: 'email', name: 'GMX' }, 'mail.yandex.ru': { type: 'email', name: 'Yandex' }, 'mail.yandex.com': { type: 'email', name: 'Yandex' }, 'mail.yandex.kz': { type: 'email', name: 'Yandex' }, 'mail.yandex.ua': { type: 'email', name: 'Yandex' }, 'mail.yandex.by': { type: 'email', name: 'Yandex' }, - 'mail.163.com': { type: 'email', name: '163 Mail' }, + 'e.mail.ru': { type: 'email', name: 'Mail.ru' }, + 'touch.mail.ru': { type: 'email', name: 'Mail.ru' }, 'mail.ukr.net': { type: 'email', name: 'Ukr.net' }, 'mail.rambler.ru': { type: 'email', name: 'Rambler' }, 'mail.mynet.com': { type: 'email', name: 
'Mynet Mail' }, @@ -2509,9 +2569,14 @@ const referrers: Record = { 'www.googleadservices.com': { type: 'paid', name: 'Google' }, 'partner.googleadservices.com': { type: 'paid', name: 'Google' }, 'googleads.g.doubleclick.net': { type: 'paid', name: 'Google' }, + 'tdsf.doubleclick.net': { type: 'paid', name: 'Google' }, 'tpc.googlesyndication.com': { type: 'paid', name: 'Google' }, + 'safeframe.googlesyndication.com': { type: 'paid', name: 'Google' }, 'googleadservices.com': { type: 'paid', name: 'Google' }, 'imasdk.googleapis.com': { type: 'paid', name: 'Google' }, + 'www.adsensecustomsearchads.com': { type: 'paid', name: 'Google' }, + 'syndicatedsearch.goog': { type: 'paid', name: 'Google' }, + 'pagead2.googlesyndication.com': { type: 'paid', name: 'Google' }, 'eyeota.net': { type: 'paid', name: 'Eyeota' }, 'price.ru': { type: 'paid', name: 'Price.ru' }, 'v.price.ru': { type: 'paid', name: 'Price.ru' }, @@ -2544,8 +2609,7 @@ const referrers: Record = { 'sonico.com': { type: 'social', name: 'Sonico.com' }, 'odnoklassniki.ru': { type: 'social', name: 'Odnoklassniki' }, 'ok.ru': { type: 'social', name: 'Odnoklassniki' }, - 'tildes.net': { type: 'social', name: 'Tildes' }, - 'com.talklittle.android.tildes': { type: 'social', name: 'Tildes' }, + 'github.com': { type: 'tech', name: 'GitHub' }, 'classmates.com': { type: 'social', name: 'Classmates' }, 'friendsreunited.com': { type: 'social', name: 'Friends Reunited' }, 'news.ycombinator.com': { type: 'social', name: 'Hacker News' }, @@ -2556,9 +2620,12 @@ const referrers: Record = { 'orkut.com': { type: 'social', name: 'Orkut' }, 'myheritage.com': { type: 'social', name: 'MyHeritage' }, 'multiply.com': { type: 'social', name: 'Multiply' }, - 'threads.net': { type: 'social', name: 'Threads' }, - 'l.threads.net': { type: 'social', name: 'Threads' }, - 'com.instagram.barcelona': { type: 'social', name: 'Threads' }, + 'facebook.com': { type: 'social', name: 'Facebook' }, + 'fb.me': { type: 'social', name: 'Facebook' }, + 
'm.facebook.com': { type: 'social', name: 'Facebook' }, + 'l.facebook.com': { type: 'social', name: 'Facebook' }, + 'lm.facebook.com': { type: 'social', name: 'Facebook' }, + 'com.facebook.katana': { type: 'social', name: 'Facebook' }, 'myyearbook.com': { type: 'social', name: 'myYearbook' }, 'renren.com': { type: 'social', name: 'Renren' }, 'app.slack.com': { type: 'social', name: 'Slack' }, @@ -2632,15 +2699,13 @@ const referrers: Record = { 'douban.com': { type: 'social', name: 'Douban' }, 'login.live.com': { type: 'social', name: 'Windows Live Spaces' }, 'blackplanet.com': { type: 'social', name: 'BlackPlanet' }, + 'lnk.bio': { type: 'social', name: 'Lnk.Bio' }, 'global.cyworld.com': { type: 'social', name: 'Cyworld' }, 'getpocket.com': { type: 'social', name: 'Pocket' }, 'skyrock.com': { type: 'social', name: 'Skyrock' }, - 'facebook.com': { type: 'social', name: 'Facebook' }, - 'fb.me': { type: 'social', name: 'Facebook' }, - 'm.facebook.com': { type: 'social', name: 'Facebook' }, - 'l.facebook.com': { type: 'social', name: 'Facebook' }, - 'lm.facebook.com': { type: 'social', name: 'Facebook' }, - 'com.facebook.katana': { type: 'social', name: 'Facebook' }, + 'threads.net': { type: 'social', name: 'Threads' }, + 'l.threads.net': { type: 'social', name: 'Threads' }, + 'com.instagram.barcelona': { type: 'social', name: 'Threads' }, 'web.whatsapp.com': { type: 'social', name: 'WhatsApp' }, 'com.whatsapp': { type: 'social', name: 'WhatsApp' }, 'redirect.disqus.com': { type: 'social', name: 'Disqus' }, @@ -2663,7 +2728,10 @@ const referrers: Record = { 'com.laurencedawson.reddit_sync': { type: 'social', name: 'Reddit' }, 'com.laurencedawson.reddit_sync.pro': { type: 'social', name: 'Reddit' }, 'viadeo.com': { type: 'social', name: 'Viadeo' }, - 'github.com': { type: 'tech', name: 'GitHub' }, + 'tildes.net': { type: 'social', name: 'Tildes' }, + 'com.talklittle.android.tildes': { type: 'social', name: 'Tildes' }, + 'l.workplace.com': { type: 'social', name: 
'Workplace' }, + 'lm.workplace.com': { type: 'social', name: 'Workplace' }, 'stackoverflow.com': { type: 'tech', name: 'Stack Overflow' }, 'gaiaonline.com': { type: 'social', name: 'Gaia Online' }, 'stumbleupon.com': { type: 'social', name: 'StumbleUpon' }, diff --git a/package.json b/package.json index f6e226de..8ce7dade 100644 --- a/package.json +++ b/package.json @@ -7,6 +7,8 @@ "packageManager": "pnpm@9.15.0", "scripts": { "test": "vitest", + "gen:bots": "pnpm -r --filter api gen:bots", + "gen:referrers": "pnpm -r --filter worker gen:referrers", "dock:up": "docker compose up -d", "dock:down": "docker compose down", "dock:ch": "docker compose exec -it op-ch clickhouse-client -d openpanel", From f61b66cb5cc9d9c5044f98e207df2d55c236364c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Carl-Gerhard=20Lindesva=CC=88rd?= Date: Tue, 22 Jul 2025 21:45:10 +0200 Subject: [PATCH 2/2] fix: handle cookie domain better --- packages/auth/constants.ts | 9 +- packages/auth/parse-cookie-domain.test.ts | 402 ++++++++++++++++++++++ packages/auth/parse-cookie-domain.ts | 124 +++++++ 3 files changed, 527 insertions(+), 8 deletions(-) create mode 100644 packages/auth/parse-cookie-domain.test.ts create mode 100644 packages/auth/parse-cookie-domain.ts diff --git a/packages/auth/constants.ts b/packages/auth/constants.ts index d344d656..fc549e91 100644 --- a/packages/auth/constants.ts +++ b/packages/auth/constants.ts @@ -1,11 +1,4 @@ -// Sorry co.uk, but you're not a top domain -const parseCookieDomain = (url: string) => { - const domain = new URL(url); - return { - domain: domain.hostname.split('.').slice(-2).join('.'), - secure: domain.protocol === 'https:', - }; -}; +import { parseCookieDomain } from './parse-cookie-domain'; const parsed = parseCookieDomain(process.env.NEXT_PUBLIC_DASHBOARD_URL ?? 
''); diff --git a/packages/auth/parse-cookie-domain.test.ts b/packages/auth/parse-cookie-domain.test.ts new file mode 100644 index 00000000..ce746899 --- /dev/null +++ b/packages/auth/parse-cookie-domain.test.ts @@ -0,0 +1,402 @@ +import { describe, expect, it } from 'vitest'; +import { parseCookieDomain } from './parse-cookie-domain'; + +describe('parseCookieDomain', () => { + it('should return undefined domain for empty string', () => { + expect(parseCookieDomain('')).toEqual({ + domain: undefined, + secure: false, + }); + }); + + describe('localhost and IP addresses', () => { + it('should return undefined domain for localhost', () => { + expect(parseCookieDomain('http://localhost:3000')).toEqual({ + domain: undefined, + secure: false, + }); + }); + + it('should return undefined domain for localhost with https', () => { + expect(parseCookieDomain('https://localhost:3000')).toEqual({ + domain: undefined, + secure: true, + }); + }); + + it('should return undefined domain for IPv4 addresses', () => { + expect(parseCookieDomain('http://192.168.1.1')).toEqual({ + domain: undefined, + secure: false, + }); + }); + + it('should return undefined domain for IPv4 addresses with https', () => { + expect(parseCookieDomain('https://192.168.1.1')).toEqual({ + domain: undefined, + secure: true, + }); + }); + + it('should return undefined domain for IPv4 addresses with port', () => { + expect(parseCookieDomain('http://192.168.1.1:8080')).toEqual({ + domain: undefined, + secure: false, + }); + }); + }); + + describe('multi-part TLDs (co.uk, com.au, etc.)', () => { + it('should handle co.uk domains correctly', () => { + expect(parseCookieDomain('https://example.co.uk')).toEqual({ + domain: '.example.co.uk', + secure: true, + }); + }); + + it('should handle subdomains of co.uk domains', () => { + expect(parseCookieDomain('https://subdomain.example.co.uk')).toEqual({ + domain: '.example.co.uk', + secure: true, + }); + }); + + it('should handle deep subdomains of co.uk domains', () => 
{ + expect(parseCookieDomain('https://api.subdomain.example.co.uk')).toEqual({ + domain: '.example.co.uk', + secure: true, + }); + }); + + it('should handle com.au domains correctly', () => { + expect(parseCookieDomain('https://example.com.au')).toEqual({ + domain: '.example.com.au', + secure: true, + }); + }); + + it('should handle subdomains of com.au domains', () => { + expect(parseCookieDomain('https://api.example.com.au')).toEqual({ + domain: '.example.com.au', + secure: true, + }); + }); + + it('should handle co.za domains correctly', () => { + expect(parseCookieDomain('https://example.co.za')).toEqual({ + domain: '.example.co.za', + secure: true, + }); + }); + + it('should handle org.uk domains correctly', () => { + expect(parseCookieDomain('https://example.org.uk')).toEqual({ + domain: '.example.org.uk', + secure: true, + }); + }); + + it('should handle gov.uk domains correctly', () => { + expect(parseCookieDomain('https://example.gov.uk')).toEqual({ + domain: '.example.gov.uk', + secure: true, + }); + }); + + it('should handle ac.uk domains correctly', () => { + expect(parseCookieDomain('https://example.ac.uk')).toEqual({ + domain: '.example.ac.uk', + secure: true, + }); + }); + + it('should handle nhs.uk domains correctly', () => { + expect(parseCookieDomain('https://example.nhs.uk')).toEqual({ + domain: '.example.nhs.uk', + secure: true, + }); + }); + }); + + describe('regular domains', () => { + it('should handle root domains correctly', () => { + expect(parseCookieDomain('https://example.com')).toEqual({ + domain: '.example.com', + secure: true, + }); + }); + + it('should handle root domains with http', () => { + expect(parseCookieDomain('http://example.com')).toEqual({ + domain: '.example.com', + secure: false, + }); + }); + + it('should handle subdomains correctly', () => { + expect(parseCookieDomain('https://api.example.com')).toEqual({ + domain: '.example.com', + secure: true, + }); + }); + + it('should handle deep subdomains correctly', () => { + 
expect(parseCookieDomain('https://v1.api.example.com')).toEqual({ + domain: '.example.com', + secure: true, + }); + }); + + it('should handle very deep subdomains correctly', () => { + expect(parseCookieDomain('https://staging.v1.api.example.com')).toEqual({ + domain: '.example.com', + secure: true, + }); + }); + }); + + describe('PaaS platform subdomains', () => { + it('should handle zeabur.app subdomains correctly', () => { + expect(parseCookieDomain('https://xxx.zeabur.app')).toEqual({ + domain: '.zeabur.app', + secure: true, + }); + }); + + it('should handle railway.app subdomains correctly', () => { + expect(parseCookieDomain('https://xxx.railway.app')).toEqual({ + domain: '.railway.app', + secure: true, + }); + }); + + it('should handle vercel.app subdomains correctly', () => { + expect(parseCookieDomain('https://xxx.vercel.app')).toEqual({ + domain: '.vercel.app', + secure: true, + }); + }); + + it('should handle netlify.app subdomains correctly', () => { + expect(parseCookieDomain('https://xxx.netlify.app')).toEqual({ + domain: '.netlify.app', + secure: true, + }); + }); + + it('should handle render.com subdomains correctly', () => { + expect(parseCookieDomain('https://xxx.onrender.com')).toEqual({ + domain: '.onrender.com', + secure: true, + }); + }); + }); + + describe('edge cases and potential breaking scenarios', () => { + it('should handle domains with ports', () => { + expect(parseCookieDomain('https://example.com:8080')).toEqual({ + domain: '.example.com', + secure: true, + }); + }); + + it('should handle domains with paths', () => { + expect(parseCookieDomain('https://example.com/path')).toEqual({ + domain: '.example.com', + secure: true, + }); + }); + + it('should handle domains with query parameters', () => { + expect(parseCookieDomain('https://example.com?param=value')).toEqual({ + domain: '.example.com', + secure: true, + }); + }); + + it('should handle domains with fragments', () => { + 
expect(parseCookieDomain('https://example.com#fragment')).toEqual({ + domain: '.example.com', + secure: true, + }); + }); + + it('should handle domains with all URL components', () => { + expect( + parseCookieDomain('https://example.com:8080/path?param=value#fragment'), + ).toEqual({ + domain: '.example.com', + secure: true, + }); + }); + + it('should handle single-level domains', () => { + expect(parseCookieDomain('https://example')).toEqual({ + domain: '.example', + secure: true, + }); + }); + + it('should handle domains with underscores (invalid but should not crash)', () => { + expect(parseCookieDomain('https://example_test.com')).toEqual({ + domain: '.example_test.com', + secure: true, + }); + }); + + it('should handle domains with hyphens', () => { + expect(parseCookieDomain('https://example-test.com')).toEqual({ + domain: '.example-test.com', + secure: true, + }); + }); + + it('should handle domains with numbers', () => { + expect(parseCookieDomain('https://example123.com')).toEqual({ + domain: '.example123.com', + secure: true, + }); + }); + }); + + describe('error cases that should break', () => { + it('should throw error for invalid URLs', () => { + expect(() => parseCookieDomain('not-a-url')).toThrow(); + }); + + it('should throw error for URLs without protocol', () => { + expect(() => parseCookieDomain('example.com')).toThrow(); + }); + + it('should throw error for malformed URLs', () => { + expect(() => parseCookieDomain('http://')).toThrow(); + }); + + it('should throw error for URLs with invalid characters', () => { + expect(() => + parseCookieDomain('http://example.com:invalid-port'), + ).toThrow(); + }); + }); + + describe('specific real-world scenarios', () => { + it('should handle openpanel.dev domains correctly', () => { + expect(parseCookieDomain('https://api.openpanel.dev')).toEqual({ + domain: '.openpanel.dev', + secure: true, + }); + }); + + it('should handle dashboard.openpanel.dev domains correctly', () => { + 
expect(parseCookieDomain('https://dashboard.openpanel.dev')).toEqual({ + domain: '.openpanel.dev', + secure: true, + }); + }); + + it('should handle subdomains of openpanel.dev correctly', () => { + expect( + parseCookieDomain('https://staging.dashboard.openpanel.dev'), + ).toEqual({ + domain: '.openpanel.dev', + secure: true, + }); + }); + + it('should handle custom domains correctly', () => { + expect(parseCookieDomain('https://myapp.com')).toEqual({ + domain: '.myapp.com', + secure: true, + }); + }); + + it('should handle subdomains of custom domains correctly', () => { + expect(parseCookieDomain('https://api.myapp.com')).toEqual({ + domain: '.myapp.com', + secure: true, + }); + }); + }); + + describe('all multi-part TLDs from the list', () => { + const multiPartTLDs = [ + 'co.uk', + 'com.au', + 'co.za', + 'co.nz', + 'co.jp', + 'co.kr', + 'co.in', + 'co.il', + 'com.br', + 'com.mx', + 'com.ar', + 'com.pe', + 'com.cl', + 'com.co', + 'com.ve', + 'net.au', + 'org.au', + 'gov.au', + 'edu.au', + 'net.nz', + 'org.nz', + 'gov.nz', + 'org.uk', + 'gov.uk', + 'ac.uk', + 'nhs.uk', + 'org.za', + 'gov.za', + 'ac.za', + 'ac.jp', + 'or.jp', + 'go.jp', + 'or.kr', + 'go.kr', + 'org.in', + 'gov.in', + 'ac.in', + 'org.il', + 'gov.il', + 'ac.il', + 'net.br', + 'org.br', + 'gov.br', + 'net.mx', + 'org.mx', + 'gov.mx', + 'net.ar', + 'org.ar', + 'gov.ar', + 'net.pe', + 'org.pe', + 'gov.pe', + 'net.cl', + 'org.cl', + 'gov.cl', + 'net.co', + 'org.co', + 'gov.co', + 'net.ve', + 'org.ve', + 'gov.ve', + ]; + + multiPartTLDs.forEach((tld) => { + it(`should handle ${tld} domains correctly`, () => { + expect(parseCookieDomain(`https://example.${tld}`)).toEqual({ + domain: `.example.${tld}`, + secure: true, + }); + }); + + it(`should handle subdomains of ${tld} domains correctly`, () => { + expect(parseCookieDomain(`https://api.example.${tld}`)).toEqual({ + domain: `.example.${tld}`, + secure: true, + }); + }); + }); + }); +}); diff --git a/packages/auth/parse-cookie-domain.ts 
// b/packages/auth/parse-cookie-domain.ts new file mode 100644 index 00000000..36b3f7e8 --- /dev/null +++ b/packages/auth/parse-cookie-domain.ts @@ -0,0 +1,124 @@

/**
 * Known two-label suffixes (e.g. `co.uk`, `com.au`) under which the cookie
 * domain must keep the last THREE labels of the hostname instead of two.
 *
 * This is a hand-maintained list, not the full Public Suffix List; suffixes
 * that are missing here are treated like ordinary TLDs.
 */
const MULTI_PART_TLDS: ReadonlySet<string> = new Set([
  'co.uk',
  'com.au',
  'co.za',
  'co.nz',
  'co.jp',
  'co.kr',
  'co.in',
  'co.il',
  'com.br',
  'com.mx',
  'com.ar',
  'com.pe',
  'com.cl',
  'com.co',
  'com.ve',
  'net.au',
  'org.au',
  'gov.au',
  'edu.au',
  'net.nz',
  'org.nz',
  'gov.nz',
  'org.uk',
  'gov.uk',
  'ac.uk',
  'nhs.uk',
  'org.za',
  'gov.za',
  'ac.za',
  'ac.jp',
  'or.jp',
  'go.jp',
  'or.kr',
  'go.kr',
  'org.in',
  'gov.in',
  'ac.in',
  'org.il',
  'gov.il',
  'ac.il',
  'net.br',
  'org.br',
  'gov.br',
  'net.mx',
  'org.mx',
  'gov.mx',
  'net.ar',
  'org.ar',
  'gov.ar',
  'net.pe',
  'org.pe',
  'gov.pe',
  'net.cl',
  'org.cl',
  'gov.cl',
  'net.co',
  'org.co',
  'gov.co',
  'net.ve',
  'org.ve',
  'gov.ve',
]);

/** Dotted-quad IPv4 hostname, e.g. `127.0.0.1`. */
const IPV4_HOSTNAME = /^\d+\.\d+\.\d+\.\d+$/;

/**
 * Derives the cookie `domain` and `secure` settings from a site URL.
 *
 * - Empty `url`: `{ domain: undefined, secure: false }`.
 * - `localhost`, IPv4 addresses and bracketed IPv6 literals: `domain` is
 *   `undefined` (a host-only cookie), because an IP address or `localhost`
 *   has no parent domain to share a cookie with.
 * - Hostnames ending in a suffix from `MULTI_PART_TLDS`: the last three
 *   labels with a leading dot (`api.example.co.uk` -> `.example.co.uk`).
 * - Everything else: the last two labels with a leading dot
 *   (`v1.api.example.com` -> `.example.com`); a single-label host keeps its
 *   one label (`example` -> `.example`).
 *
 * @param url - Absolute URL including the protocol.
 * @returns The cookie domain (or `undefined`) and whether the URL is https.
 * @throws TypeError from the `URL` constructor when `url` is non-empty but
 *   cannot be parsed (for example when the protocol is missing).
 */
export const parseCookieDomain = (
  url: string,
): { domain: string | undefined; secure: boolean } => {
  if (!url) {
    return { domain: undefined, secure: false };
  }

  const parsedUrl = new URL(url);
  const { hostname } = parsedUrl;
  const secure = parsedUrl.protocol === 'https:';

  // `URL#hostname` keeps the square brackets around IPv6 literals ("[::1]"),
  // so a leading "[" is enough to recognise them.
  const isIpLiteral = IPV4_HOSTNAME.test(hostname) || hostname.startsWith('[');
  if (hostname === 'localhost' || isIpLiteral) {
    return { domain: undefined, secure };
  }

  const labels = hostname.split('.');

  // A multi-part TLD only counts when at least one label precedes it, so a
  // bare "co.uk" is handled like any other two-label host.
  const endsWithMultiPartTld =
    labels.length >= 3 && MULTI_PART_TLDS.has(labels.slice(-2).join('.'));
  const keptLabels = endsWithMultiPartTld ? 3 : 2;

  // `slice(-n)` returns the whole array when it holds fewer than n labels,
  // which covers single-label and plain root domains.
  return {
    domain: `.${labels.slice(-keptLabels).join('.')}`,
    secure,
  };
};

// Evaluated once at module load. A malformed NEXT_PUBLIC_DASHBOARD_URL makes
// this throw during import; an unset one yields a host-only, non-secure cookie.
const parsed = parseCookieDomain(process.env.NEXT_PUBLIC_DASHBOARD_URL ?? '');

/** 60 * 60 * 24 * 30, i.e. 30 days assuming the unit is seconds. */
export const COOKIE_MAX_AGE = 60 * 60 * 24 * 30;

/** Cookie attributes built from the dashboard URL parsed above. */
export const COOKIE_OPTIONS = {
  domain: parsed.domain,
  secure: parsed.secure,
  sameSite: 'lax',
  httpOnly: true,
  path: '/',
} as const;