From f21dad1519cbe498b7b93f6ba35a9ae80a3a6514 Mon Sep 17 00:00:00 2001
From: Evan Hahn <69474926+EvanHahn-Signal@users.noreply.github.com>
Date: Thu, 8 Oct 2020 11:50:55 -0500
Subject: [PATCH] Mark long hrefs or those with invalid characters as sneaky

---
 js/modules/link_previews.js        | 107 +++++++++-----
 test/modules/link_previews_test.js | 222 ++++++++++++++++++++---------
 2 files changed, 232 insertions(+), 97 deletions(-)

diff --git a/js/modules/link_previews.js b/js/modules/link_previews.js
index a92312a2b..1d36e1533 100644
--- a/js/modules/link_previews.js
+++ b/js/modules/link_previews.js
@@ -1,6 +1,6 @@
 /* global URL */
 
-const { isNumber, compact, isEmpty } = require('lodash');
+const { isNumber, compact, isEmpty, range } = require('lodash');
 const nodeUrl = require('url');
 const LinkifyIt = require('linkify-it');
 
@@ -14,14 +14,17 @@ module.exports = {
   isStickerPack,
 };
 
-function isLinkSafeToPreview(link) {
-  let url;
+function maybeParseHref(href) {
   try {
-    url = new URL(link);
+    return new URL(href);
   } catch (err) {
-    return false;
+    return null;
   }
-  return url.protocol === 'https:' && !isLinkSneaky(link);
+}
+
+function isLinkSafeToPreview(href) {
+  const url = maybeParseHref(href);
+  return Boolean(url && url.protocol === 'https:' && !isLinkSneaky(href));
 }
 
 function isStickerPack(link) {
@@ -52,35 +55,66 @@ function findLinks(text, caretLocation) {
   );
 }
 
-function hasAuth(url) {
-  try {
-    const urlObject = new URL(url);
-    return Boolean(urlObject.username);
-  } catch (e) {
-    return null;
-  }
-}
-
-function getDomain(url) {
-  try {
-    const urlObject = new URL(url);
-    return urlObject.hostname;
-  } catch (error) {
-    return null;
-  }
+function getDomain(href) {
+  const url = maybeParseHref(href);
+  return url ? url.hostname : null;
 }
 
+// See <https://tools.ietf.org/html/rfc3986>.
+const VALID_URI_CHARACTERS = new Set([
+  '%',
+  // "gen-delims"
+  ':',
+  '/',
+  '?',
+  '#',
+  '[',
+  ']',
+  '@',
+  // "sub-delims"
+  '!',
+  '$',
+  '&',
+  "'",
+  '(',
+  ')',
+  '*',
+  '+',
+  ',',
+  ';',
+  '=',
+  // unreserved
+  ...String.fromCharCode(...range(65, 91), ...range(97, 123)),
+  ...range(10).map(String),
+  '-',
+  '.',
+  '_',
+  '~',
+]);
 const ASCII_PATTERN = new RegExp('[\\u0020-\\u007F]', 'g');
+const MAX_HREF_LENGTH = 2 ** 12;
 
-function isLinkSneaky(link) {
-  // Any links which contain auth are considered sneaky
-  if (hasAuth(link)) {
+function isLinkSneaky(href) {
+  // This helps users avoid extremely long links (which could be hiding something
+  //   sketchy) and also sidesteps the performance implications of extremely long hrefs.
+  if (href.length > MAX_HREF_LENGTH) {
+    return true;
+  }
+
+  const url = maybeParseHref(href);
+
+  // If we can't parse it, it's sneaky.
+  if (!url) {
+    return true;
+  }
+
+  // Any links which contain auth are considered sneaky
+  if (url.username) {
     return true;
   }
 
-  const domain = getDomain(link);
   // If the domain is falsy, something fishy is going on
-  if (!domain) {
+  if (!url.hostname) {
     return true;
   }
 
@@ -89,25 +123,25 @@ function isLinkSneaky(link) {
   //   maximum of 2048. (This also uses the string's `.length` property,
   //   which isn't exactly the same thing as the number of octets.)
   // [0]: https://tools.ietf.org/html/rfc1034
-  if (domain.length > 2048) {
+  if (url.hostname.length > 2048) {
     return true;
   }
 
   // Domains cannot contain encoded characters
-  if (domain.includes('%')) {
+  if (url.hostname.includes('%')) {
     return true;
   }
 
   // There must be at least 2 domain labels, and none of them can be empty.
-  const labels = domain.split('.');
+  const labels = url.hostname.split('.');
   if (labels.length < 2 || labels.some(isEmpty)) {
     return true;
   }
 
   // This is necesary because getDomain returns domains in punycode form.
   const unicodeDomain = nodeUrl.domainToUnicode
-    ? nodeUrl.domainToUnicode(domain)
-    : domain;
+    ? nodeUrl.domainToUnicode(url.hostname)
+    : url.hostname;
 
   const withoutPeriods = unicodeDomain.replace(/\./g, '');
 
@@ -119,5 +153,12 @@ function isLinkSneaky(link) {
     return true;
   }
 
-  return false;
+  // We can't use `url.pathname` (and so on) because it automatically encodes strings.
+  //   For example, it turns `/aquí` into `/aqu%C3%AD`.
+  const startOfPathAndHash = href.indexOf('/', url.protocol.length + 4);
+  const pathAndHash =
+    startOfPathAndHash === -1 ? '' : href.substr(startOfPathAndHash);
+  return [...pathAndHash].some(
+    character => !VALID_URI_CHARACTERS.has(character)
+  );
 }
diff --git a/test/modules/link_previews_test.js b/test/modules/link_previews_test.js
index 8ee6e72d0..315016031 100644
--- a/test/modules/link_previews_test.js
+++ b/test/modules/link_previews_test.js
@@ -114,49 +114,6 @@ describe('Link previews', () => {
   });
 
   describe('#isLinkSneaky', () => {
-    it('returns false for all-latin domain', () => {
-      const link = 'https://www.amazon.com';
-      const actual = isLinkSneaky(link);
-      assert.strictEqual(actual, false);
-    });
-
-    it('returns false for IPv4 addresses', () => {
-      assert.isFalse(isLinkSneaky('https://127.0.0.1/path'));
-    });
-
-    // It's possible that this should return `false` but we'd need to add special logic
-    //   for it.
-    it('returns true for IPv6 addresses', () => {
-      assert.isTrue(
-        isLinkSneaky('https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]/path')
-      );
-      assert.isTrue(isLinkSneaky('https://[::]/path'));
-    });
-
-    it('returns true for Latin + Cyrillic domain', () => {
-      const link = 'https://www.aмazon.com';
-      const actual = isLinkSneaky(link);
-      assert.strictEqual(actual, true);
-    });
-
-    it('returns true for Latin + Greek domain', () => {
-      const link = 'https://www.αpple.com';
-      const actual = isLinkSneaky(link);
-      assert.strictEqual(actual, true);
-    });
-
-    it('returns true for ASCII and non-ASCII mix', () => {
-      const link = 'https://www.аррӏе.com';
-      const actual = isLinkSneaky(link);
-      assert.strictEqual(actual, true);
-    });
-
-    it('returns true for Latin + High Greek domain', () => {
-      const link = `https://www.apple${String.fromCodePoint(0x101a0)}.com`;
-      const actual = isLinkSneaky(link);
-      assert.strictEqual(actual, true);
-    });
-
     it('returns true for =', () => {
       const link = 'r.id=s.id';
       assert.strictEqual(isLinkSneaky(link), true);
@@ -177,35 +134,172 @@ describe('Link previews', () => {
       assert.strictEqual(isLinkSneaky(link), true);
     });
 
-    it('returns true for auth (or pretend auth)', () => {
-      const link = 'http://whatever.com&login=someuser@77777777';
-      assert.strictEqual(isLinkSneaky(link), true);
+    it('returns true for URLs with a length of 4097 or higher', () => {
+      const href = `https://example.com/${'a'.repeat(4077)}`;
+      assert.lengthOf(href, 4097, 'Test href is not the proper length');
+
+      assert.isTrue(isLinkSneaky(href));
+      assert.isTrue(isLinkSneaky(`${href}?foo=bar`));
     });
 
-    it("returns true if the domain doesn't contain a .", () => {
-      assert.isTrue(isLinkSneaky('https://example'));
-      assert.isTrue(isLinkSneaky('https://localhost'));
-      assert.isTrue(isLinkSneaky('https://localhost:3000'));
+    describe('auth', () => {
+      it('returns true for hrefs with auth (or pretend auth)', () => {
+        assert.isTrue(isLinkSneaky('https://user:pass@example.com'));
+        assert.isTrue(
+          isLinkSneaky('http://whatever.com&login=someuser@77777777')
+        );
+      });
     });
 
-    it('returns true if the domain has any empty labels', () => {
-      assert.isTrue(isLinkSneaky('https://example.'));
-      assert.isTrue(isLinkSneaky('https://example.com.'));
-      assert.isTrue(isLinkSneaky('https://.example.com'));
-      assert.isTrue(isLinkSneaky('https://..example.com'));
+    describe('domain', () => {
+      it('returns false for all-latin domain', () => {
+        const link = 'https://www.amazon.com';
+        const actual = isLinkSneaky(link);
+        assert.strictEqual(actual, false);
+      });
+
+      it('returns false for IPv4 addresses', () => {
+        assert.isFalse(isLinkSneaky('https://127.0.0.1/path'));
+      });
+
+      // It's possible that this should return `false` but we'd need to add special logic
+      //   for it.
+      it('returns true for IPv6 addresses', () => {
+        assert.isTrue(
+          isLinkSneaky('https://[2001:0db8:85a3:0000:0000:8a2e:0370:7334]/path')
+        );
+        assert.isTrue(isLinkSneaky('https://[::]/path'));
+      });
+
+      it('returns true for Latin + Cyrillic domain', () => {
+        const link = 'https://www.aмazon.com';
+        const actual = isLinkSneaky(link);
+        assert.strictEqual(actual, true);
+      });
+
+      it('returns true for Latin + Greek domain', () => {
+        const link = 'https://www.αpple.com';
+        const actual = isLinkSneaky(link);
+        assert.strictEqual(actual, true);
+      });
+
+      it('returns true for ASCII and non-ASCII mix', () => {
+        const link = 'https://www.аррӏе.com';
+        const actual = isLinkSneaky(link);
+        assert.strictEqual(actual, true);
+      });
+
+      it('returns true for Latin + High Greek domain', () => {
+        const link = `https://www.apple${String.fromCodePoint(0x101a0)}.com`;
+        const actual = isLinkSneaky(link);
+        assert.strictEqual(actual, true);
+      });
+
+      it("returns true if the domain doesn't contain a .", () => {
+        assert.isTrue(isLinkSneaky('https://example'));
+        assert.isTrue(isLinkSneaky('https://localhost'));
+        assert.isTrue(isLinkSneaky('https://localhost:3000'));
+      });
+
+      it('returns true if the domain has any empty labels', () => {
+        assert.isTrue(isLinkSneaky('https://example.'));
+        assert.isTrue(isLinkSneaky('https://example.com.'));
+        assert.isTrue(isLinkSneaky('https://.example.com'));
+        assert.isTrue(isLinkSneaky('https://..example.com'));
+      });
+
+      it('returns true if the domain is longer than 2048 UTF-16 code points', () => {
+        const domain = `${'a'.repeat(2041)}.example`;
+        assert.lengthOf(domain, 2049, 'Test domain is the incorrect length');
+        const link = `https://${domain}/foo/bar`;
+        assert.isTrue(isLinkSneaky(link));
+      });
     });
 
-    it('returns true if the domain is longer than 2048 UTF-16 code points', () => {
-      const domain = `${'a'.repeat(2041)}.example`;
-      assert.lengthOf(domain, 2049, 'Test domain is the incorrect length');
-      const link = `https://${domain}/foo/bar`;
-      assert.isTrue(isLinkSneaky(link));
+    describe('pathname', () => {
+      it('returns false for no pathname', () => {
+        assert.isFalse(isLinkSneaky('https://example.com'));
+        assert.isFalse(isLinkSneaky('https://example.com/'));
+      });
+
+      it('returns false if the pathname contains valid characters', () => {
+        assert.isFalse(isLinkSneaky('https://example.com/foo'));
+        assert.isFalse(isLinkSneaky('https://example.com/foo/bar'));
+        assert.isFalse(
+          isLinkSneaky("https://example.com/:/[]@!$&'()*+,;=abc123-._~%")
+        );
+        assert.isFalse(
+          isLinkSneaky(
+            'https://lbry.tv/@ScammerRevolts:b0/DELETING-EVERY-FILE-OFF-A-SCAMMERS-LAPTOP-Destroyed:1'
+          )
+        );
+      });
+
+      it('returns true if the pathname contains invalid characters', () => {
+        assert.isTrue(isLinkSneaky('https://example.com/hello world'));
+        assert.isTrue(isLinkSneaky('https://example.com/aquí-está'));
+        assert.isTrue(isLinkSneaky('https://example.com/hello\x00world'));
+        assert.isTrue(isLinkSneaky('https://example.com/hello\nworld'));
+        assert.isTrue(isLinkSneaky('https://example.com/hello😈world'));
+      });
     });
 
-    it('returns false for regular @ in url', () => {
-      const link =
-        'https://lbry.tv/@ScammerRevolts:b0/DELETING-EVERY-FILE-OFF-A-SCAMMERS-LAPTOP-Destroyed:1';
-      assert.strictEqual(isLinkSneaky(link), false);
+    describe('query string', () => {
+      it('returns false for no query', () => {
+        assert.isFalse(isLinkSneaky('https://example.com/foo'));
+        assert.isFalse(isLinkSneaky('https://example.com/foo?'));
+      });
+
+      it('returns false if the query string contains valid characters', () => {
+        assert.isFalse(isLinkSneaky('https://example.com/foo?bar'));
+        assert.isFalse(isLinkSneaky('https://example.com/foo?bar=baz'));
+        assert.isFalse(
+          isLinkSneaky(
+            "https://example.com/foo?bar=:/[]@!$&'()*+,;=abc123-._~%"
+          )
+        );
+        assert.isFalse(
+          isLinkSneaky(
+            "https://example.com/foo?:/[]@!$&'()*+,;=abc123-._~%=baz"
+          )
+        );
+      });
+
+      it('returns true if the query string contains invalid characters', () => {
+        assert.isTrue(isLinkSneaky('https://example.com/foo?bar baz'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo?bar baz=qux'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo?bar=baz qux'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo?aquí=está'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo?hello=\x00world'));
+        assert.isTrue(
+          isLinkSneaky('https://example.com/foo?hello=hello\nworld')
+        );
+        assert.isTrue(isLinkSneaky('https://example.com/foo?hello=😈world'));
+      });
+    });
+
+    describe('hash', () => {
+      it('returns false for no hash', () => {
+        assert.isFalse(isLinkSneaky('https://example.com/foo'));
+        assert.isFalse(isLinkSneaky('https://example.com/foo#'));
+      });
+
+      it('returns false if the hash contains valid characters', () => {
+        assert.isFalse(isLinkSneaky('https://example.com/foo#bar'));
+        assert.isFalse(
+          isLinkSneaky("https://example.com/foo#:/[]@!$&'()*+,;=abc123-._~%")
+        );
+      });
+
+      it('returns true if the hash contains invalid characters', () => {
+        assert.isTrue(isLinkSneaky('https://example.com/foo#bar baz'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo#bar baz=qux'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo#bar=baz qux'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo#aquí_está'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo#hello\x00world'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo#hello\nworld'));
+        assert.isTrue(isLinkSneaky('https://example.com/foo#hello😈world'));
+      });
     });
   });
 });