feat(utils): add extractUrlsFromText util

2026-03-03 05:55:03 +01:00 · 2024-07-30 16:06:57 +02:00
parent a9d04a5604
commit 15e03f9cf2
3 changed files with 147 additions and 0 deletions
--- a/packages/utils/src/extractUrlsFromText.ts
+++ b/packages/utils/src/extractUrlsFromText.ts
@@ -0,0 +1,35 @@
+const URL_REGEX =
+    /\b(?:https?:\/\/|www\.)[a-zA-Z0-9-._~:/?#[\]@!$&'()*+,;=%]+\b|(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(?=\b|\s|$|\])/gi;
+
+/**
+ * Splits `text` into the URLs it contains and the plain text around them.
+ *
+ * `textParts[i]` is always the text preceding `urls[i]` (an empty string when nothing
+ * precedes it) and any text after the last URL becomes one extra, final part, so
+ * interleaving the two arrays by index reproduces `text`.
+ * @param text - Text that may embed URLs, e.g. a token name.
+ * @returns Plain-text segments and matched URLs, each in order of appearance.
+ */
+export const extractUrlsFromText = (text: string) => {
+    const urls: string[] = [];
+    const textParts: string[] = [];
+    let lastIndex = 0;
+
+    for (const match of text.matchAll(URL_REGEX)) {
+        const url = match[0];
+        // `matchAll` always sets `index`; the fallback only guards the optional typing.
+        const index = match.index ?? lastIndex;
+        // Record the preceding text even when it is empty; skipping it would shift every
+        // later entry of `textParts` out of step with `urls` when a URL starts the text.
+        textParts.push(text.slice(lastIndex, index));
+        urls.push(url);
+        lastIndex = index + url.length;
+    }
+
+    // Capture any remaining text after the last URL
+    if (lastIndex < text.length) {
+        textParts.push(text.slice(lastIndex));
+    }
+
+    return { textParts, urls };
+};
--- a/packages/utils/src/index.ts
+++ b/packages/utils/src/index.ts
@@ -43,3 +43,4 @@ export * from './logs';
 export * from './logsManager';
 export * from './bigNumber';
 export * from './throttler';
+export * from './extractUrlsFromText';
--- a/packages/utils/tests/extractUrlsFromText.test.ts
+++ b/packages/utils/tests/extractUrlsFromText.test.ts
@@ -0,0 +1,111 @@
+import { extractUrlsFromText } from '../src/extractUrlsFromText';
+
+describe('extractUrlsFromText', () => {
+    /** Input text plus the exact output expected from `extractUrlsFromText`. */
+    type Scenario = {
+        title: string;
+        input: string;
+        expectedTextParts: string[];
+        expectedUrls: string[];
+    };
+
+    // Each scenario holds the title, input and expectations of one test case.
+    const scenarios: Scenario[] = [
+        {
+            title: 'should return textParts and urls for text with URLs',
+            input:
+                'Go to this page to claim your rewards: http://example.com/url and also check out www.phishing-site.com',
+            expectedTextParts: [
+                'Go to this page to claim your rewards: ',
+                ' and also check out ',
+            ],
+            expectedUrls: ['http://example.com/url', 'www.phishing-site.com'],
+        },
+        {
+            title: 'should handle text without URLs',
+            input: 'This is a message very nice Ethereum token',
+            expectedTextParts: ['This is a message very nice Ethereum token'],
+            expectedUrls: [],
+        },
+        // Version-like numbers have no alphabetic TLD, so they are not URLs.
+        {
+            title: 'should not match invalid URLs like "2.0" in token name',
+            input: 'Liquid staked Ether 2.0',
+            expectedTextParts: ['Liquid staked Ether 2.0'],
+            expectedUrls: [],
+        },
+        {
+            title: 'should handle text with multiple URLs related to Ethereum tokens correctly',
+            input:
+                'Visit https://etherscan.io, http://mycrypto.com, and www.ethereum.org to claim your tokens.',
+            expectedTextParts: ['Visit ', ', ', ', and ', ' to claim your tokens.'],
+            expectedUrls: ['https://etherscan.io', 'http://mycrypto.com', 'www.ethereum.org'],
+        },
+        {
+            title: 'should correctly extract URLs from phishing messages related to Ethereum',
+            input:
+                'Attention! Go to http://phishing-site.com to secure your Ethereum wallet immediately.',
+            expectedTextParts: [
+                'Attention! Go to ',
+                ' to secure your Ethereum wallet immediately.',
+            ],
+            expectedUrls: ['http://phishing-site.com'],
+        },
+        {
+            title: 'should not match any url in case of USDT name',
+            input: 'Tether USD',
+            expectedTextParts: ['Tether USD'],
+            expectedUrls: [],
+        },
+        {
+            title: 'should not match any url in case of USDT symbol',
+            input: 'USDT',
+            expectedTextParts: ['USDT'],
+            expectedUrls: [],
+        },
+        // A text that is only a URL still yields a single (empty) text part.
+        {
+            title: 'should match url in case of just url',
+            input: 'USDT.io',
+            expectedTextParts: [''],
+            expectedUrls: ['USDT.io'],
+        },
+        {
+            title: 'should match url in case of scam',
+            input: '$ USDCXmas.com',
+            expectedTextParts: ['$ '],
+            expectedUrls: ['USDCXmas.com'],
+        },
+        // Bare domains stop at the TLD; the path after it is returned as text.
+        {
+            title: 'should match url in case scam name with emoji',
+            input: '🎁10K$ gift at [bit.ly/tpepe]',
+            expectedTextParts: ['🎁10K$ gift at [', '/tpepe]'],
+            expectedUrls: ['bit.ly'],
+        },
+        // Hex addresses contain no dot and must be left alone.
+        {
+            title: 'should not match url in of contract address',
+            input: '0xcDa4e840411C00a614aD9205CAEC807c7458a0E3',
+            expectedTextParts: ['0xcDa4e840411C00a614aD9205CAEC807c7458a0E3'],
+            expectedUrls: [],
+        },
+        // The single space between two URLs is returned as its own text part.
+        {
+            title: 'should match two urls next to each other',
+            input: 'Visit trezor.io ledger.com',
+            expectedTextParts: ['Visit ', ' '],
+            expectedUrls: ['trezor.io', 'ledger.com'],
+        },
+    ];
+
+    // Register one test per scenario, using its title as the test name.
+    scenarios.forEach(({ title, input, expectedTextParts, expectedUrls }) => {
+        it(title, () => {
+            const { textParts, urls } = extractUrlsFromText(input);
+
+            expect(textParts).toEqual(expectedTextParts);
+            expect(urls).toEqual(expectedUrls);
+        });
+    });
+});