feat(utils): add extractUrlsFromText util

This commit is contained in:
tomasklim
2024-07-30 16:06:57 +02:00
committed by Tomáš Klíma
parent a9d04a5604
commit 15e03f9cf2
3 changed files with 147 additions and 0 deletions

View File

@@ -0,0 +1,35 @@
/**
 * Matches URL-like substrings. Two alternatives are tried in order:
 *  1. a `http://`, `https://` or `www.` prefix followed by URL characters,
 *     ending on a word boundary (so trailing punctuation such as `,` is left out);
 *  2. a bare domain: one or more dot-terminated labels followed by a TLD of at
 *     least two letters (so `2.0` is not a match). A path following a bare
 *     domain is NOT part of the match (`bit.ly/abc` matches only `bit.ly`).
 */
const URL_REGEX =
    /\b(?:https?:\/\/|www\.)[a-zA-Z0-9-._~:/?#[\]@!$&'()*+,;=%]+\b|(?:[a-zA-Z0-9-]+\.)+[a-zA-Z]{2,}(?=\b|\s|$|\])/gi;

/**
 * Splits `text` into plain-text segments and the URL-like substrings between them.
 *
 * The two arrays are positionally aligned: `textParts[i]` is the text that
 * immediately precedes `urls[i]` (an empty string when the URL sits at the very
 * start of the text or directly follows the previous URL). If non-empty text
 * remains after the last URL it is appended as one extra trailing element, so
 * `textParts.length` is `urls.length` or `urls.length + 1`. Emitting
 * `textParts[0], urls[0], textParts[1], urls[1], ...` therefore reproduces the
 * original text exactly.
 *
 * @param text - Arbitrary text (e.g. a token name) that may contain URLs.
 * @returns `textParts` and `urls` as described above; both are empty for an empty string.
 */
export const extractUrlsFromText = (text: string): { textParts: string[]; urls: string[] } => {
    const urls: string[] = [];
    const textParts: string[] = [];
    let lastIndex = 0;

    for (const match of text.matchAll(URL_REGEX)) {
        const url = match[0];
        // `matchAll` always populates `index`; the fallback only satisfies the type
        // checker and keeps `lastIndex` sane instead of jumping to a negative offset.
        const index = match.index ?? lastIndex;

        // Always record the segment before the URL, even when it is empty, so that
        // textParts[i] stays paired with urls[i] when the text starts with a URL.
        textParts.push(text.slice(lastIndex, index));
        urls.push(url);
        lastIndex = index + url.length;
    }

    // Capture any remaining text after the last URL (or the whole text when no URL was found).
    if (lastIndex < text.length) {
        textParts.push(text.slice(lastIndex));
    }

    return { textParts, urls };
};

View File

@@ -43,3 +43,4 @@ export * from './logs';
export * from './logsManager';
export * from './bigNumber';
export * from './throttler';
export * from './extractUrlsFromText';

View File

@@ -0,0 +1,111 @@
import { extractUrlsFromText } from '../src/extractUrlsFromText';
// Table-driven suite for extractUrlsFromText: every row is registered as its own
// test case and checks both returned arrays against the expected values.
describe('extractUrlsFromText', () => {
    type ExtractCase = {
        title: string;
        input: string;
        expectedTextParts: string[];
        expectedUrls: string[];
    };

    const cases: readonly ExtractCase[] = [
        {
            title: 'should return textParts and urls for text with URLs',
            input: 'Go to this page to claim your rewards: http://example.com/url and also check out www.phishing-site.com',
            expectedTextParts: ['Go to this page to claim your rewards: ', ' and also check out '],
            expectedUrls: ['http://example.com/url', 'www.phishing-site.com'],
        },
        {
            title: 'should handle text without URLs',
            input: 'This is a message very nice Ethereum token',
            expectedTextParts: ['This is a message very nice Ethereum token'],
            expectedUrls: [],
        },
        {
            // A version number has no alphabetic TLD, so it must not be treated as a domain.
            title: 'should not match invalid URLs like "2.0" in token name',
            input: 'Liquid staked Ether 2.0',
            expectedTextParts: ['Liquid staked Ether 2.0'],
            expectedUrls: [],
        },
        {
            // Punctuation directly after a URL stays in the surrounding text.
            title: 'should handle text with multiple URLs related to Ethereum tokens correctly',
            input: 'Visit https://etherscan.io, http://mycrypto.com, and www.ethereum.org to claim your tokens.',
            expectedTextParts: ['Visit ', ', ', ', and ', ' to claim your tokens.'],
            expectedUrls: ['https://etherscan.io', 'http://mycrypto.com', 'www.ethereum.org'],
        },
        {
            title: 'should correctly extract URLs from phishing messages related to Ethereum',
            input: 'Attention! Go to http://phishing-site.com to secure your Ethereum wallet immediately.',
            expectedTextParts: ['Attention! Go to ', ' to secure your Ethereum wallet immediately.'],
            expectedUrls: ['http://phishing-site.com'],
        },
        {
            title: 'should not match any url in case of USDT name',
            input: 'Tether USD',
            expectedTextParts: ['Tether USD'],
            expectedUrls: [],
        },
        {
            title: 'should not match any url in case of USDT symbol',
            input: 'USDT',
            expectedTextParts: ['USDT'],
            expectedUrls: [],
        },
        {
            // URL-only input still yields a single (empty) text part.
            title: 'should match url in case of just url',
            input: 'USDT.io',
            expectedTextParts: [''],
            expectedUrls: ['USDT.io'],
        },
        {
            title: 'should match url in case of scam',
            input: '$ USDCXmas.com',
            expectedTextParts: ['$ '],
            expectedUrls: ['USDCXmas.com'],
        },
        {
            // Only the bare domain is extracted; the path remains part of the text.
            title: 'should match url in case scam name with emoji',
            input: '🎁10K$ gift at [bit.ly/tpepe]',
            expectedTextParts: ['🎁10K$ gift at [', '/tpepe]'],
            expectedUrls: ['bit.ly'],
        },
        {
            title: 'should not match url in of contract address',
            input: '0xcDa4e840411C00a614aD9205CAEC807c7458a0E3',
            expectedTextParts: ['0xcDa4e840411C00a614aD9205CAEC807c7458a0E3'],
            expectedUrls: [],
        },
        {
            title: 'should match two urls next to each other',
            input: 'Visit trezor.io ledger.com',
            expectedTextParts: ['Visit ', ' '],
            expectedUrls: ['trezor.io', 'ledger.com'],
        },
    ];

    for (const { title, input, expectedTextParts, expectedUrls } of cases) {
        it(title, () => {
            const result = extractUrlsFromText(input);

            expect(result.textParts).toEqual(expectedTextParts);
            expect(result.urls).toEqual(expectedUrls);
        });
    }
});