From 471005d9d60382dc8fdd8def29ac12a5ffe85b3e Mon Sep 17 00:00:00 2001 From: vringar Date: Sat, 4 Apr 2026 20:39:09 +0000 Subject: [PATCH 1/3] fix(dns-instrument): fix PendingResponse pollution and add redirect hop tracking [CL-6] - Remove onHeadersReceivedEventDetails from shared PendingResponse - Add redirect_url column to dns_responses schema - Rename onCompleteDnsHandler to onHeadersReceivedDnsHandler --- .gitignore | 33 ++++++++++++++++ Extension/src/background/dns-instrument.ts | 38 ++++++++----------- Extension/src/schema.ts | 1 + .../browser-web-request-event-details.ts | 17 ++++++--- openwpm/storage/schema.sql | 3 +- 5 files changed, 63 insertions(+), 29 deletions(-) diff --git a/.gitignore b/.gitignore index b9bf1801b7..1a23ed8c18 100644 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,36 @@ docs/apidoc/ node_modules datadir + +# === Crosslink managed (do not edit between markers) === +# .crosslink/ — machine-local state (never commit) +.crosslink/issues.db +.crosslink/issues.db-wal +.crosslink/issues.db-shm +.crosslink/agent.json +.crosslink/session.json +.crosslink/daemon.pid +.crosslink/daemon.log +.crosslink/last_test_run +.crosslink/keys/ +.crosslink/.hub-cache/ +.crosslink/.knowledge-cache/ +.crosslink/.cache/ +.crosslink/hook-config.local.json +.crosslink/integrations/ +.crosslink/rules.local/ + +# .crosslink/ — DO track these (project-level policy): +# .crosslink/hook-config.json — shared team configuration +# .crosslink/rules/ — project coding standards +# .crosslink/.gitignore — inner gitignore for agent files + +# .claude/ — auto-generated by crosslink init (not project source) +.claude/hooks/ +.claude/commands/ +.claude/mcp/ + +# .claude/ — DO track these (if manually configured): +# .claude/settings.json — Claude Code project settings +# .claude/settings.local.json is per-developer, ignore separately if needed +# === End crosslink managed === diff --git a/Extension/src/background/dns-instrument.ts b/Extension/src/background/dns-instrument.ts index 63aa5d6f6d..b140c93177 100755 --- a/Extension/src/background/dns-instrument.ts +++ b/Extension/src/background/dns-instrument.ts @@ -1,14 +1,11 @@ -import { PendingResponse } from "../lib/pending-response"; import { DnsResolved } from "../schema"; import { allTypes } from "./http-instrument"; +import { WebRequestOnHeadersReceivedDetails } from "../types/browser-web-request-event-details"; import RequestFilter = browser.webRequest.RequestFilter; export class DnsInstrument { private readonly dataReceiver; - private onCompleteListener; - private pendingResponses: { - [requestId: number]: PendingResponse; - } = {}; + private onHeadersReceivedListener; constructor(dataReceiver) { this.dataReceiver = dataReceiver; @@ -28,37 +25,33 @@ export class DnsInstrument { /* * Attach handlers to event listeners */ - this.onCompleteListener = ( - details: browser.webRequest._OnCompletedDetails, + this.onHeadersReceivedListener = ( + details: browser.webRequest._OnHeadersReceivedDetails, ) => { // Ignore requests made by extensions if (requestStemsFromExtension(details)) { return; } - const pendingResponse = this.getPendingResponse(details.requestId); - pendingResponse.resolveOnCompletedEventDetails(details); - this.onCompleteDnsHandler(details, crawlID); + this.onHeadersReceivedDnsHandler(details, crawlID); }; - browser.webRequest.onCompleted.addListener(this.onCompleteListener, filter); + browser.webRequest.onHeadersReceived.addListener( + this.onHeadersReceivedListener, + filter, + ); } public cleanup() { - if (this.onCompleteListener) { - browser.webRequest.onCompleted.removeListener(this.onCompleteListener); - } - } - - private getPendingResponse(requestId): PendingResponse { - if (!this.pendingResponses[requestId]) { - this.pendingResponses[requestId] = new PendingResponse(); + if (this.onHeadersReceivedListener) { + browser.webRequest.onHeadersReceived.removeListener( + this.onHeadersReceivedListener, + ); } - return this.pendingResponses[requestId]; } - private async onCompleteDnsHandler( - details: browser.webRequest._OnCompletedDetails, + private async onHeadersReceivedDnsHandler( + details: WebRequestOnHeadersReceivedDetails, crawlID, ) { // Create and populate DnsResolve object @@ -66,6 +59,7 @@ export class DnsInstrument { dnsRecord.browser_id = crawlID; dnsRecord.request_id = Number(details.requestId); dnsRecord.used_address = details.ip; + dnsRecord.redirect_url = details.url; const currentTime = new Date(details.timeStamp); dnsRecord.time_stamp = currentTime.toISOString(); diff --git a/Extension/src/schema.ts b/Extension/src/schema.ts index 842d373564..b88cc8193f 100644 --- a/Extension/src/schema.ts +++ b/Extension/src/schema.ts @@ -169,6 +169,7 @@ export interface DnsResolved { visit_id?: number; hostname: string; request_id: number; + redirect_url?: string; time_stamp: DateTime; addresses?: string; used_address?: string; diff --git a/Extension/src/types/browser-web-request-event-details.ts b/Extension/src/types/browser-web-request-event-details.ts index febaaf557d..7f2666274b 100644 --- a/Extension/src/types/browser-web-request-event-details.ts +++ b/Extension/src/types/browser-web-request-event-details.ts @@ -1,11 +1,5 @@ /* eslint-disable no-underscore-dangle */ -/** - * This file contains selected implicit interfaces copied from node_modules/@types/firefox-webext-browser/index.d.ts - * Defined and exported here in order for our code to be able to reference them explicitly in helper functions - * and class methods that accept arguments of these types. - */ - export interface FrameAncestor { /** The URL that the document was loaded from. */ url: string; @@ -18,3 +12,14 @@ export interface WebRequestOnBeforeSendHeadersEventDetails /** Contains information for each document in the frame hierarchy up to the top-level document. The first element in the array contains information about the immediate parent of the document being requested, and the last element contains information about the top-level document. If the load is actually for the top-level document, then this array is empty. */ frameAncestors: FrameAncestor[]; } + +export interface WebRequestOnHeadersReceivedDetails + extends browser.webRequest._OnHeadersReceivedDetails { + /** + * The server IP address that the request was actually sent to. + * Present in Firefox's onHeadersReceived per MDN but not in @types/firefox-webext-browser. + * May be undefined if the IP has not yet been resolved (e.g. cached responses); + * undefined maps to NULL in dns_responses.used_address, which is valid signal. + */ + ip?: string; +} diff --git a/openwpm/storage/schema.sql b/openwpm/storage/schema.sql index aab7126702..3dd52bb85f 100644 --- a/openwpm/storage/schema.sql +++ b/openwpm/storage/schema.sql @@ -240,9 +240,10 @@ CREATE TABLE IF NOT EXISTS dns_responses ( browser_id INTEGER NOT NULL, visit_id INTEGER NOT NULL, hostname TEXT, + redirect_url TEXT, addresses TEXT, used_address TEXT, canonical_name TEXT, - is_TRR INTEGER, + is_TRR INTEGER, time_stamp DATETIME NOT NULL ); From c83bc2378af73c69ebb45763dd19005ced07764f Mon Sep 17 00:00:00 2001 From: vringar Date: Sat, 4 Apr 2026 20:48:29 +0000 Subject: [PATCH 2/3] fix(dns-instrument): sync parquet schema and add redirect_url test coverage [CL-6] --- Extension/src/background/dns-instrument.ts | 2 +- openwpm/storage/parquet_schema.py | 2 ++ test/storage/test_values.py | 2 ++ test/test_dns_instrument.py | 6 ++++++ 4 files changed, 11 insertions(+), 1 deletion(-) diff --git a/Extension/src/background/dns-instrument.ts b/Extension/src/background/dns-instrument.ts index b140c93177..d073907312 100755 --- a/Extension/src/background/dns-instrument.ts +++ b/Extension/src/background/dns-instrument.ts @@ -26,7 +26,7 @@ export class DnsInstrument { * Attach handlers to event listeners */ this.onHeadersReceivedListener = ( - details: browser.webRequest._OnHeadersReceivedDetails, + details: WebRequestOnHeadersReceivedDetails, ) => { // Ignore requests made by extensions if (requestStemsFromExtension(details)) { diff --git a/openwpm/storage/parquet_schema.py b/openwpm/storage/parquet_schema.py index cf83f4406d..7033b69025 100644 --- a/openwpm/storage/parquet_schema.py +++ b/openwpm/storage/parquet_schema.py @@ -237,7 +237,9 @@ pa.field("browser_id", pa.uint32(), nullable=False), pa.field("visit_id", pa.int64(), nullable=False), pa.field("hostname", pa.string()), + pa.field("redirect_url", pa.string()), pa.field("addresses", pa.string()), + pa.field("used_address", pa.string()), pa.field("canonical_name", pa.string()), pa.field("is_TRR", pa.bool_()), pa.field("time_stamp", pa.string(), nullable=False), diff --git a/test/storage/test_values.py b/test/storage/test_values.py index b8bcaeb85e..9b58ba8ad3 100644 --- a/test/storage/test_values.py +++ b/test/storage/test_values.py @@ -228,7 +228,9 @@ def random_word(length): "browser_id": random.randint(0, 2**31 - 1), "visit_id": random.randint(0, 2**63 - 1), "hostname": random_word(12), + "redirect_url": random_word(12), "addresses": random_word(12), + "used_address": random_word(12), "canonical_name": random_word(12), "is_TRR": random.choice([True, False]), "time_stamp": random_word(12), diff --git a/test/test_dns_instrument.py b/test/test_dns_instrument.py index 34a537d47a..4a280bc51c 100644 --- a/test/test_dns_instrument.py +++ b/test/test_dns_instrument.py @@ -19,3 +19,9 @@ def test_name_resolution(default_params, task_manager_creator): assert result["addresses"] == "127.0.0.1,::1" assert result["hostname"] == "test.localhost" assert result["canonical_name"] == "test.localhost" + assert result["redirect_url"] is not None + assert "test.localhost:8000" in result["redirect_url"] + + # Each redirect hop should record the URL it was associated with + redirect_urls = [r["redirect_url"] for r in results] + assert all(url is not None for url in redirect_urls) From 087f858efd712db36b0bac6cf5a64a3fe48bf6bb Mon Sep 17 00:00:00 2001 From: vringar Date: Sun, 5 Apr 2026 17:51:36 +0000 Subject: [PATCH 3/3] test(dns): add regression test for DNS capture on connection abort Adds a /CONNECTION_ABORT/ handler to the test server that sends headers then forcibly closes the connection, and a test proving DNS data is still captured. This guards against reverting from onHeadersReceived to onCompleted, which would miss aborted requests. --- test/test_dns_instrument.py | 21 +++++++++++++++++++++ test/utilities.py | 9 +++++++++ 2 files changed, 30 insertions(+) diff --git a/test/test_dns_instrument.py b/test/test_dns_instrument.py index 4a280bc51c..05f8edb79e 100644 --- a/test/test_dns_instrument.py +++ b/test/test_dns_instrument.py @@ -25,3 +25,24 @@ def test_name_resolution(default_params, task_manager_creator): # Each redirect hop should record the URL it was associated with redirect_urls = [r["redirect_url"] for r in results] assert all(url is not None for url in redirect_urls) + + +def test_dns_captured_on_connection_abort(default_params, task_manager_creator): + """Regression test: DNS data must be captured even when the connection + aborts before completion. This verifies that the extension uses + onHeadersReceived (not onCompleted) to record DNS responses.""" + manager_params, browser_params = default_params + for browser_param in browser_params: + browser_param.dns_instrument = True + + manager, db = task_manager_creator((manager_params, browser_params)) + manager.get("http://localhost:8000/CONNECTION_ABORT/") + manager.close() + + results = db_utils.query_db(db, "SELECT * FROM dns_responses") + assert len(results) > 0, "No DNS responses captured for aborted connection" + result = results[0] + assert isinstance(result, Row) + assert result["used_address"] is not None + assert result["addresses"] is not None + assert result["hostname"] == "localhost" diff --git a/test/utilities.py b/test/utilities.py index c7ba7dafd5..e7de32f3ce 100644 --- a/test/utilities.py +++ b/test/utilities.py @@ -83,6 +83,15 @@ def do_GET(self, *args, **kwargs): self.end_headers() return + # 2. Abort connection after sending partial response. + if self.path.startswith("/CONNECTION_ABORT/"): + self.send_response(200) + self.send_header("Content-Length", "99999") + self.end_headers() + self.wfile.write(b"partial") + self.wfile.close() + return + # Otherwise, return file from disk return SimpleHTTPRequestHandler.do_GET(self, *args, **kwargs)