From f21dad1519cbe498b7b93f6ba35a9ae80a3a6514 Mon Sep 17 00:00:00 2001 From: Evan Hahn <69474926+EvanHahn-Signal@users.noreply.github.com> Date: Thu, 8 Oct 2020 11:50:55 -0500 Subject: [PATCH] Mark long hrefs or those with invalid characters as sneaky --- js/modules/link_previews.js | 107 +++++++++----- test/modules/link_previews_test.js | 222 ++++++++++++++++++++--------- 2 files changed, 232 insertions(+), 97 deletions(-) diff --git a/js/modules/link_previews.js b/js/modules/link_previews.js index a92312a2b..1d36e1533 100644 --- a/js/modules/link_previews.js +++ b/js/modules/link_previews.js @@ -1,6 +1,6 @@ /* global URL */ -const { isNumber, compact, isEmpty } = require('lodash'); +const { isNumber, compact, isEmpty, range } = require('lodash'); const nodeUrl = require('url'); const LinkifyIt = require('linkify-it'); @@ -14,14 +14,17 @@ module.exports = { isStickerPack, }; -function isLinkSafeToPreview(link) { - let url; +function maybeParseHref(href) { try { - url = new URL(link); + return new URL(href); } catch (err) { - return false; + return null; } - return url.protocol === 'https:' && !isLinkSneaky(link); +} + +function isLinkSafeToPreview(href) { + const url = maybeParseHref(href); + return Boolean(url && url.protocol === 'https:' && !isLinkSneaky(href)); } function isStickerPack(link) { @@ -52,35 +55,66 @@ function findLinks(text, caretLocation) { ); } -function hasAuth(url) { - try { - const urlObject = new URL(url); - return Boolean(urlObject.username); - } catch (e) { - return null; - } -} - -function getDomain(url) { - try { - const urlObject = new URL(url); - return urlObject.hostname; - } catch (error) { - return null; - } +function getDomain(href) { + const url = maybeParseHref(href); + return url ? url.hostname : null; } +// See . +const VALID_URI_CHARACTERS = new Set([ + '%', + // "gen-delims" + ':', + '/', + '?', + '#', + '[', + ']', + '@', + // "sub-delims" + '!', + '$', + '&', + "'", + '(', + ')', + '*', + '+', + ',', + ';', + '=', + // unreserved + ...String.fromCharCode(...range(65, 91), ...range(97, 123)), + ...range(10).map(String), + '-', + '.', + '_', + '~', +]); const ASCII_PATTERN = new RegExp('[\\u0020-\\u007F]', 'g'); +const MAX_HREF_LENGTH = 2 ** 12; -function isLinkSneaky(link) { - // Any links which contain auth are considered sneaky - if (hasAuth(link)) { +function isLinkSneaky(href) { + // This helps users avoid extremely long links (which could be hiding something + // sketchy) and also sidesteps the performance implications of extremely long hrefs. + if (href.length > MAX_HREF_LENGTH) { + return true; + } + + const url = maybeParseHref(href); + + // If we can't parse it, it's sneaky. + if (!url) { + return true; + } + + // Any links which contain auth are considered sneaky + if (url.username) { return true; } - const domain = getDomain(link); // If the domain is falsy, something fishy is going on - if (!domain) { + if (!url.hostname) { return true; } @@ -89,25 +123,25 @@ function isLinkSneaky(link) { // maximum of 2048. (This also uses the string's `.length` property, // which isn't exactly the same thing as the number of octets.) // [0]: https://tools.ietf.org/html/rfc1034 - if (domain.length > 2048) { + if (url.hostname.length > 2048) { return true; } // Domains cannot contain encoded characters - if (domain.includes('%')) { + if (url.hostname.includes('%')) { return true; } // There must be at least 2 domain labels, and none of them can be empty. - const labels = domain.split('.'); + const labels = url.hostname.split('.'); if (labels.length < 2 || labels.some(isEmpty)) { return true; } // This is necesary because getDomain returns domains in punycode form. const unicodeDomain = nodeUrl.domainToUnicode - ? nodeUrl.domainToUnicode(domain) - : domain; + ? nodeUrl.domainToUnicode(url.hostname) + : url.hostname; const withoutPeriods = unicodeDomain.replace(/\./g, ''); @@ -119,5 +153,12 @@ function isLinkSneaky(link) { return true; } - return false; + // We can't use `url.pathname` (and so on) because it automatically encodes strings. + // For example, it turns `/aquí` into `/aqu%C3%AD`. + const startOfPathAndHash = href.indexOf('/', url.protocol.length + 4); + const pathAndHash = + startOfPathAndHash === -1 ? '' : href.substr(startOfPathAndHash); + return [...pathAndHash].some( + character => !VALID_URI_CHARACTERS.has(character) + ); } diff --git a/test/modules/link_previews_test.js b/test/modules/link_previews_test.js index 8ee6e72d0..315016031 100644 --- a/test/modules/link_previews_test.js +++ b/test/modules/link_previews_test.js @@ -114,49 +114,6 @@ describe('Link previews', () => { }); describe('#isLinkSneaky', () => { - it('returns false for all-latin domain', () => { - const link = 'https://www.amazon.com'; - const actual = isLinkSneaky(link); - assert.strictEqual(actual, false); - }); - - it('returns false for IPv4 addresses', () => { - assert.isFalse(isLinkSneaky('https://127.0.0.1/path')); - }); - - // It's possible that this should return `false` but we'd need to add special logic - // for it. - it('returns true for IPv6 addresses', () => { - assert.isTrue( - isLinkSneaky('https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]/path') - ); - assert.isTrue(isLinkSneaky('https://[::]/path')); - }); - - it('returns true for Latin + Cyrillic domain', () => { - const link = 'https://www.aмazon.com'; - const actual = isLinkSneaky(link); - assert.strictEqual(actual, true); - }); - - it('returns true for Latin + Greek domain', () => { - const link = 'https://www.αpple.com'; - const actual = isLinkSneaky(link); - assert.strictEqual(actual, true); - }); - - it('returns true for ASCII and non-ASCII mix', () => { - const link = 'https://www.аррӏе.com'; - const actual = isLinkSneaky(link); - assert.strictEqual(actual, true); - }); - - it('returns true for Latin + High Greek domain', () => { - const link = `https://www.apple${String.fromCodePoint(0x101a0)}.com`; - const actual = isLinkSneaky(link); - assert.strictEqual(actual, true); - }); - it('returns true for =', () => { const link = 'r.id=s.id'; assert.strictEqual(isLinkSneaky(link), true); @@ -177,35 +134,172 @@ describe('Link previews', () => { assert.strictEqual(isLinkSneaky(link), true); }); - it('returns true for auth (or pretend auth)', () => { - const link = 'http://whatever.com&login=someuser@77777777'; - assert.strictEqual(isLinkSneaky(link), true); + it('returns true for URLs with a length of 4097 or higher', () => { + const href = `https://example.com/${'a'.repeat(4077)}`; + assert.lengthOf(href, 4097, 'Test href is not the proper length'); + + assert.isTrue(isLinkSneaky(href)); + assert.isTrue(isLinkSneaky(`${href}?foo=bar`)); }); - it("returns true if the domain doesn't contain a .", () => { - assert.isTrue(isLinkSneaky('https://example')); - assert.isTrue(isLinkSneaky('https://localhost')); - assert.isTrue(isLinkSneaky('https://localhost:3000')); + describe('auth', () => { + it('returns true for hrefs with auth (or pretend auth)', () => { + assert.isTrue(isLinkSneaky('https://user:pass@example.com')); + assert.isTrue( + isLinkSneaky('http://whatever.com&login=someuser@77777777') + ); + }); }); - it('returns true if the domain has any empty labels', () => { - assert.isTrue(isLinkSneaky('https://example.')); - assert.isTrue(isLinkSneaky('https://example.com.')); - assert.isTrue(isLinkSneaky('https://.example.com')); - assert.isTrue(isLinkSneaky('https://..example.com')); + describe('domain', () => { + it('returns false for all-latin domain', () => { + const link = 'https://www.amazon.com'; + const actual = isLinkSneaky(link); + assert.strictEqual(actual, false); + }); + + it('returns false for IPv4 addresses', () => { + assert.isFalse(isLinkSneaky('https://127.0.0.1/path')); + }); + + // It's possible that this should return `false` but we'd need to add special logic + // for it. + it('returns true for IPv6 addresses', () => { + assert.isTrue( + isLinkSneaky('https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]/path') + ); + assert.isTrue(isLinkSneaky('https://[::]/path')); + }); + + it('returns true for Latin + Cyrillic domain', () => { + const link = 'https://www.aмazon.com'; + const actual = isLinkSneaky(link); + assert.strictEqual(actual, true); + }); + + it('returns true for Latin + Greek domain', () => { + const link = 'https://www.αpple.com'; + const actual = isLinkSneaky(link); + assert.strictEqual(actual, true); + }); + + it('returns true for ASCII and non-ASCII mix', () => { + const link = 'https://www.аррӏе.com'; + const actual = isLinkSneaky(link); + assert.strictEqual(actual, true); + }); + + it('returns true for Latin + High Greek domain', () => { + const link = `https://www.apple${String.fromCodePoint(0x101a0)}.com`; + const actual = isLinkSneaky(link); + assert.strictEqual(actual, true); + }); + + it("returns true if the domain doesn't contain a .", () => { + assert.isTrue(isLinkSneaky('https://example')); + assert.isTrue(isLinkSneaky('https://localhost')); + assert.isTrue(isLinkSneaky('https://localhost:3000')); + }); + + it('returns true if the domain has any empty labels', () => { + assert.isTrue(isLinkSneaky('https://example.')); + assert.isTrue(isLinkSneaky('https://example.com.')); + assert.isTrue(isLinkSneaky('https://.example.com')); + assert.isTrue(isLinkSneaky('https://..example.com')); + }); + + it('returns true if the domain is longer than 2048 UTF-16 code points', () => { + const domain = `${'a'.repeat(2041)}.example`; + assert.lengthOf(domain, 2049, 'Test domain is the incorrect length'); + const link = `https://${domain}/foo/bar`; + assert.isTrue(isLinkSneaky(link)); + }); }); - it('returns true if the domain is longer than 2048 UTF-16 code points', () => { - const domain = `${'a'.repeat(2041)}.example`; - assert.lengthOf(domain, 2049, 'Test domain is the incorrect length'); - const link = `https://${domain}/foo/bar`; - assert.isTrue(isLinkSneaky(link)); + describe('pathname', () => { + it('returns false for no pathname', () => { + assert.isFalse(isLinkSneaky('https://example.com')); + assert.isFalse(isLinkSneaky('https://example.com/')); + }); + + it('returns false if the pathname contains valid characters', () => { + assert.isFalse(isLinkSneaky('https://example.com/foo')); + assert.isFalse(isLinkSneaky('https://example.com/foo/bar')); + assert.isFalse( + isLinkSneaky("https://example.com/:/[]@!$&'()*+,;=abc123-._~%") + ); + assert.isFalse( + isLinkSneaky( + 'https://lbry.tv/@ScammerRevolts:b0/DELETING-EVERY-FILE-OFF-A-SCAMMERS-LAPTOP-Destroyed:1' + ) + ); + }); + + it('returns true if the pathname contains invalid characters', () => { + assert.isTrue(isLinkSneaky('https://example.com/hello world')); + assert.isTrue(isLinkSneaky('https://example.com/aquí-está')); + assert.isTrue(isLinkSneaky('https://example.com/hello\x00world')); + assert.isTrue(isLinkSneaky('https://example.com/hello\nworld')); + assert.isTrue(isLinkSneaky('https://example.com/hello😈world')); + }); }); - it('returns false for regular @ in url', () => { - const link = - 'https://lbry.tv/@ScammerRevolts:b0/DELETING-EVERY-FILE-OFF-A-SCAMMERS-LAPTOP-Destroyed:1'; - assert.strictEqual(isLinkSneaky(link), false); + describe('query string', () => { + it('returns false for no query', () => { + assert.isFalse(isLinkSneaky('https://example.com/foo')); + assert.isFalse(isLinkSneaky('https://example.com/foo?')); + }); + + it('returns false if the query string contains valid characters', () => { + assert.isFalse(isLinkSneaky('https://example.com/foo?bar')); + assert.isFalse(isLinkSneaky('https://example.com/foo?bar=baz')); + assert.isFalse( + isLinkSneaky( + "https://example.com/foo?bar=:/[]@!$&'()*+,;=abc123-._~%" + ) + ); + assert.isFalse( + isLinkSneaky( + "https://example.com/foo?:/[]@!$&'()*+,;=abc123-._~%=baz" + ) + ); + }); + + it('returns true if the query string contains invalid characters', () => { + assert.isTrue(isLinkSneaky('https://example.com/foo?bar baz')); + assert.isTrue(isLinkSneaky('https://example.com/foo?bar baz=qux')); + assert.isTrue(isLinkSneaky('https://example.com/foo?bar=baz qux')); + assert.isTrue(isLinkSneaky('https://example.com/foo?aquí=está')); + assert.isTrue(isLinkSneaky('https://example.com/foo?hello=\x00world')); + assert.isTrue( + isLinkSneaky('https://example.com/foo?hello=hello\nworld') + ); + assert.isTrue(isLinkSneaky('https://example.com/foo?hello=😈world')); + }); + }); + + describe('hash', () => { + it('returns false for no hash', () => { + assert.isFalse(isLinkSneaky('https://example.com/foo')); + assert.isFalse(isLinkSneaky('https://example.com/foo#')); + }); + + it('returns false if the hash contains valid characters', () => { + assert.isFalse(isLinkSneaky('https://example.com/foo#bar')); + assert.isFalse( + isLinkSneaky("https://example.com/foo#:/[]@!$&'()*+,;=abc123-._~%") + ); + }); + + it('returns true if the hash contains invalid characters', () => { + assert.isTrue(isLinkSneaky('https://example.com/foo#bar baz')); + assert.isTrue(isLinkSneaky('https://example.com/foo#bar baz=qux')); + assert.isTrue(isLinkSneaky('https://example.com/foo#bar=baz qux')); + assert.isTrue(isLinkSneaky('https://example.com/foo#aquí_está')); + assert.isTrue(isLinkSneaky('https://example.com/foo#hello\x00world')); + assert.isTrue(isLinkSneaky('https://example.com/foo#hello\nworld')); + assert.isTrue(isLinkSneaky('https://example.com/foo#hello😈world')); + }); }); }); });