From c86ed5287b6277ee0768125d145b868519039537 Mon Sep 17 00:00:00 2001
From: Raymond Hill <rhill@raymondhill.net>
Date: Thu, 10 Oct 2024 11:11:07 -0400
Subject: [PATCH] Add regex extraction transformation step to `urlskip=` option

Related feedback:
https://github.com/uBlockOrigin/uBlock-issues/issues/3206#issuecomment-2403795984

The first capture group of the regex will be used as the result of the
transformation. Example:

||podtrac.com/pts/redirect.mp3/$urlskip=/podtrac\.com\/pts\/redirect\.mp3\/(.*?\.mp3\b)/ +https

If the regex is invalid, or if it fails to extract a first capture
group, no redirection will occur.
---
 src/js/static-net-filtering.js | 101 +++++++++++++++++++++++++--------
 1 file changed, 76 insertions(+), 25 deletions(-)

diff --git a/src/js/static-net-filtering.js b/src/js/static-net-filtering.js
index 194229573..9f50e0b16 100644
--- a/src/js/static-net-filtering.js
+++ b/src/js/static-net-filtering.js
@@ -5356,12 +5356,10 @@ StaticNetFilteringEngine.prototype.transformRequest = function(fctxt, out = [])
             out.push(directive);
             continue;
         }
-        const { refs } = directive;
-        if ( refs instanceof Object === false ) { continue; }
-        if ( refs.$cache === null ) {
-            refs.$cache = sfp.parseReplaceValue(refs.value);
+        if ( directive.cache === null ) {
+            directive.cache = sfp.parseReplaceValue(directive.value);
         }
-        const cache = refs.$cache;
+        const cache = directive.cache;
         if ( cache === undefined ) { continue; }
         const before = `${redirectURL.pathname}${redirectURL.search}${redirectURL.hash}`;
         if ( cache.re.test(before) !== true ) { continue; }
@@ -5382,7 +5380,49 @@ StaticNetFilteringEngine.prototype.transformRequest = function(fctxt, out = [])
     return out;
 };
 
-/******************************************************************************/
+/**
+ * @trustedOption urlskip
+ * 
+ * @description
+ * Extract a URL from another URL according to one or more transformation steps,
+ * thereby skipping over intermediate network request(s) to remote servers.
+ * Requires a trusted source.
+ * 
+ * @param steps
+ * A series of space-separated directives representing the transformation steps
+ * to perform to extract the final URL to which a network request should be
+ * redirected.
+ * 
+ * Supported directives:
+ * 
+ * `?name`: extract the value of parameter `name` as the current string.
+ * 
+ * `&i`: extract the name of the parameter at position `i` as the current
+ *   string. The position is 1-based.
+ * 
+ * `/.../`: extract the first capture group of a regex as the current string.
+ * 
+ * `+https`: prepend the current string with `https://`.
+ * 
+ * `-base64`: decode the current string as a base64-encoded string.
+ * 
+ * At any given step, the currently extracted string may not necessarily be
+ * a valid URL, and more transformation steps may be needed to obtain a valid
+ * URL once all the steps are applied.
+ * 
+ * An unsupported step or a failed step will abort the transformation and no
+ * redirection will be performed.
+ * 
+ * The final step is expected to yield a valid URL. If the result is not a
+ * valid URL, no redirection will be performed.
+ * 
+ * @example
+ * ||example.com/path/to/tracker$urlskip=?url
+ * ||example.com/path/to/tracker$urlskip=?url ?to
+ * ||pixiv.net/jump.php?$urlskip=&1
+ * ||podtrac.com/pts/redirect.mp3/$urlskip=/podtrac\.com\/pts\/redirect\.mp3\/(.*?\.mp3\b)/ +https
+ * 
+ * */
 
 StaticNetFilteringEngine.prototype.urlSkip = function(fctxt, out = []) {
     if ( fctxt.redirectURL !== undefined ) { return; }
@@ -5396,7 +5436,7 @@ StaticNetFilteringEngine.prototype.urlSkip = function(fctxt, out = []) {
         const urlin = fctxt.url;
         const value = directive.value;
         const steps = value.includes(' ') && value.split(/ +/) || [ value ];
-        const urlout = urlSkip(urlin, steps);
+        const urlout = urlSkip(directive, urlin, steps);
         if ( urlout === undefined ) { continue; }
         if ( urlout === urlin ) { continue; }
         fctxt.redirectURL = urlout;
@@ -5407,41 +5447,52 @@ StaticNetFilteringEngine.prototype.urlSkip = function(fctxt, out = []) {
     return out;
 };
 
-function urlSkip(urlin, steps) {
+function urlSkip(directive, urlin, steps) {
     try {
-        let urlout;
+        let urlout = urlin;
         for ( const step of steps ) {
+            const urlin = urlout;
             const c0 = step.charCodeAt(0);
-            // Extract from URL parameter
-            if ( c0 === 0x3F ) { /* ? */
-                urlout = (new URL(urlin)).searchParams.get(step.slice(1));
-                if ( urlout === null ) { return; }
-                if ( urlout.includes(' ') ) {
-                    urlout = urlout.replace(/ /g, '%20');
-                }
-                urlin = urlout;
-                continue;
-            }
             // Extract from URL parameter name at position i
-            if ( c0 === 0x26 ) { /* & */
+            if ( c0 === 0x26 ) { // &
                 const i = (parseInt(step.slice(1)) || 0) - 1;
                 if ( i < 0 ) { return; }
                 const url = new URL(urlin);
                 if ( i >= url.searchParams.size ) { return; }
                 const params = Array.from(url.searchParams.keys());
-                urlin = urlout = decodeURIComponent(params[i]);
+                urlout = decodeURIComponent(params[i]);
                 continue;
             }
             // Enforce https
-            if ( step === '+https' ) {
+            if ( c0 === 0x2B && step === '+https' ) {
                 const s = urlin.replace(/^https?:\/\//, '');
                 if ( /^[\w-]:\/\//.test(s) ) { return; }
-                urlin = urlout = `https://${s}`;
+                urlout = `https://${s}`;
                 continue;
             }
             // Decode base64
-            if ( step === '-base64' ) {
-                urlin = urlout = self.atob(urlin);
+            if ( c0 === 0x2D && step === '-base64' ) {
+                urlout = self.atob(urlin);
+                continue;
+            }
+            // Regex extraction from first capture group
+            if ( c0 === 0x2F ) { // /
+                if ( directive.cache === null ) {
+                    directive.cache = new RegExp(step.slice(1, -1));
+                }
+                const match = directive.cache.exec(urlin);
+                if ( match === null ) { return; }
+                if ( match.length <= 1 ) { return; }
+                urlout = match[1];
+                continue;
+            }
+            // Extract from URL parameter
+            if ( c0 === 0x3F ) { // ?
+                urlout = (new URL(urlin)).searchParams.get(step.slice(1));
+                if ( urlout === null ) { return; }
+                if ( urlout.includes(' ') ) {
+                    urlout = urlout.replace(/ /g, '%20');
+                }
                 continue;
             }
             // Unknown directive