Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 33 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,36 @@ docs/apidoc/
node_modules

datadir

# === Crosslink managed (do not edit between markers) ===
# .crosslink/ — machine-local state (never commit)
.crosslink/issues.db
.crosslink/issues.db-wal
.crosslink/issues.db-shm
.crosslink/agent.json
.crosslink/session.json
.crosslink/daemon.pid
.crosslink/daemon.log
.crosslink/last_test_run
.crosslink/keys/
.crosslink/.hub-cache/
.crosslink/.knowledge-cache/
.crosslink/.cache/
.crosslink/hook-config.local.json
.crosslink/integrations/
.crosslink/rules.local/

# .crosslink/ — DO track these (project-level policy):
# .crosslink/hook-config.json — shared team configuration
# .crosslink/rules/ — project coding standards
# .crosslink/.gitignore — inner gitignore for agent files

# .claude/ — auto-generated by crosslink init (not project source)
.claude/hooks/
.claude/commands/
.claude/mcp/

# .claude/ — DO track these (if manually configured):
# .claude/settings.json — Claude Code project settings
# .claude/settings.local.json is per-developer, ignore separately if needed
# === End crosslink managed ===
38 changes: 16 additions & 22 deletions Extension/src/background/dns-instrument.ts
Original file line number Diff line number Diff line change
@@ -1,14 +1,11 @@
import { PendingResponse } from "../lib/pending-response";
import { DnsResolved } from "../schema";
import { allTypes } from "./http-instrument";
import { WebRequestOnHeadersReceivedDetails } from "../types/browser-web-request-event-details";
import RequestFilter = browser.webRequest.RequestFilter;

export class DnsInstrument {
private readonly dataReceiver;
private onCompleteListener;
private pendingResponses: {
[requestId: number]: PendingResponse;
} = {};
private onHeadersReceivedListener;

constructor(dataReceiver) {
this.dataReceiver = dataReceiver;
Expand All @@ -28,44 +25,41 @@ export class DnsInstrument {
/*
* Attach handlers to event listeners
*/
this.onCompleteListener = (
details: browser.webRequest._OnCompletedDetails,
this.onHeadersReceivedListener = (
details: WebRequestOnHeadersReceivedDetails,
) => {
// Ignore requests made by extensions
if (requestStemsFromExtension(details)) {
return;
}
const pendingResponse = this.getPendingResponse(details.requestId);
pendingResponse.resolveOnCompletedEventDetails(details);

this.onCompleteDnsHandler(details, crawlID);
this.onHeadersReceivedDnsHandler(details, crawlID);
};

browser.webRequest.onCompleted.addListener(this.onCompleteListener, filter);
browser.webRequest.onHeadersReceived.addListener(
this.onHeadersReceivedListener,
filter,
);
}

public cleanup() {
if (this.onCompleteListener) {
browser.webRequest.onCompleted.removeListener(this.onCompleteListener);
}
}

private getPendingResponse(requestId): PendingResponse {
if (!this.pendingResponses[requestId]) {
this.pendingResponses[requestId] = new PendingResponse();
if (this.onHeadersReceivedListener) {
browser.webRequest.onHeadersReceived.removeListener(
this.onHeadersReceivedListener,
);
}
return this.pendingResponses[requestId];
}

private async onCompleteDnsHandler(
details: browser.webRequest._OnCompletedDetails,
private async onHeadersReceivedDnsHandler(
details: WebRequestOnHeadersReceivedDetails,
crawlID,
) {
// Create and populate DnsResolve object
const dnsRecord = {} as DnsResolved;
dnsRecord.browser_id = crawlID;
dnsRecord.request_id = Number(details.requestId);
dnsRecord.used_address = details.ip;
dnsRecord.redirect_url = details.url;
const currentTime = new Date(details.timeStamp);
dnsRecord.time_stamp = currentTime.toISOString();

Expand Down
1 change: 1 addition & 0 deletions Extension/src/schema.ts
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,7 @@ export interface DnsResolved {
visit_id?: number;
hostname: string;
request_id: number;
redirect_url?: string;
time_stamp: DateTime;
addresses?: string;
used_address?: string;
Expand Down
17 changes: 11 additions & 6 deletions Extension/src/types/browser-web-request-event-details.ts
Original file line number Diff line number Diff line change
@@ -1,11 +1,5 @@
/* eslint-disable no-underscore-dangle */

/**
* This file contains selected implicit interfaces copied from node_modules/@types/firefox-webext-browser/index.d.ts
* Defined and exported here in order for our code to be able to reference them explicitly in helper functions
* and class methods that accept arguments of these types.
*/

export interface FrameAncestor {
/** The URL that the document was loaded from. */
url: string;
Expand All @@ -18,3 +12,14 @@ export interface WebRequestOnBeforeSendHeadersEventDetails
/** Contains information for each document in the frame hierarchy up to the top-level document. The first element in the array contains information about the immediate parent of the document being requested, and the last element contains information about the top-level document. If the load is actually for the top-level document, then this array is empty. */
frameAncestors: FrameAncestor[];
}

/**
 * Details object handed to `webRequest.onHeadersReceived` listeners,
 * extended with the `ip` property that the bundled typings leave out.
 */
export interface WebRequestOnHeadersReceivedDetails
  extends browser.webRequest._OnHeadersReceivedDetails {
  /**
   * IP address of the server the request was actually sent to.
   *
   * MDN documents this property for Firefox's onHeadersReceived event, but
   * @types/firefox-webext-browser does not declare it, hence this interface.
   * The value may be undefined when the IP has not been resolved yet
   * (e.g. cached responses); undefined is stored as NULL in
   * dns_responses.used_address, which is itself valid signal.
   */
  ip?: string;
}
2 changes: 2 additions & 0 deletions openwpm/storage/parquet_schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,9 @@
pa.field("browser_id", pa.uint32(), nullable=False),
pa.field("visit_id", pa.int64(), nullable=False),
pa.field("hostname", pa.string()),
pa.field("redirect_url", pa.string()),
pa.field("addresses", pa.string()),
pa.field("used_address", pa.string()),
pa.field("canonical_name", pa.string()),
pa.field("is_TRR", pa.bool_()),
pa.field("time_stamp", pa.string(), nullable=False),
Expand Down
3 changes: 2 additions & 1 deletion openwpm/storage/schema.sql
Original file line number Diff line number Diff line change
Expand Up @@ -240,9 +240,10 @@ CREATE TABLE IF NOT EXISTS dns_responses (
browser_id INTEGER NOT NULL,
visit_id INTEGER NOT NULL,
hostname TEXT,
redirect_url TEXT,
addresses TEXT,
used_address TEXT,
canonical_name TEXT,
is_TRR INTEGER,
is_TRR INTEGER,
time_stamp DATETIME NOT NULL
);
2 changes: 2 additions & 0 deletions test/storage/test_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,9 @@ def random_word(length):
"browser_id": random.randint(0, 2**31 - 1),
"visit_id": random.randint(0, 2**63 - 1),
"hostname": random_word(12),
"redirect_url": random_word(12),
"addresses": random_word(12),
"used_address": random_word(12),
"canonical_name": random_word(12),
"is_TRR": random.choice([True, False]),
"time_stamp": random_word(12),
Expand Down
27 changes: 27 additions & 0 deletions test/test_dns_instrument.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,30 @@ def test_name_resolution(default_params, task_manager_creator):
assert result["addresses"] == "127.0.0.1,::1"
assert result["hostname"] == "test.localhost"
assert result["canonical_name"] == "test.localhost"
assert result["redirect_url"] is not None
assert "test.localhost:8000" in result["redirect_url"]

# Each redirect hop should record the URL it was associated with
redirect_urls = [r["redirect_url"] for r in results]
assert all(url is not None for url in redirect_urls)
Comment on lines +22 to +27

Copilot AI Apr 5, 2026

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Because the test indexes into results[0] from an unfiltered SELECT * FROM dns_responses (no ORDER BY), the new behavior of logging on onHeadersReceived (redirect hops + potentially additional requests) can make row ordering nondeterministic and the assertions flaky. Consider filtering the query to the expected request/hostname and/or adding an explicit ORDER BY (e.g., id or time_stamp) before selecting the row(s) to assert against.

Copilot uses AI. Check for mistakes.


def test_dns_captured_on_connection_abort(default_params, task_manager_creator):
    """Regression test: DNS data must be captured even when the connection
    aborts before completion. This verifies that the extension uses
    onHeadersReceived (not onCompleted) to record DNS responses.

    The assertions target the dns_responses row whose redirect_url points at
    the /CONNECTION_ABORT/ endpoint rather than whichever row the unordered
    query happens to return first, so unrelated requests made during the
    visit can neither satisfy nor break the check.
    """
    manager_params, browser_params = default_params
    for browser_param in browser_params:
        browser_param.dns_instrument = True

    manager, db = task_manager_creator((manager_params, browser_params))
    manager.get("http://localhost:8000/CONNECTION_ABORT/")
    manager.close()

    results = db_utils.query_db(db, "SELECT * FROM dns_responses")
    assert len(results) > 0, "No DNS responses captured for aborted connection"

    # redirect_url holds the URL of the request the record belongs to; use it
    # to pin the aborted request itself (it may be NULL on other rows).
    aborted_rows = [
        row for row in results if "/CONNECTION_ABORT/" in (row["redirect_url"] or "")
    ]
    assert aborted_rows, "No DNS response recorded for the aborted request itself"

    result = aborted_rows[0]
    assert isinstance(result, Row)
    assert result["used_address"] is not None
    assert result["addresses"] is not None
    assert result["hostname"] == "localhost"
9 changes: 9 additions & 0 deletions test/utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,15 @@ def do_GET(self, *args, **kwargs):
self.end_headers()
return

# 2. Abort connection after sending partial response.
if self.path.startswith("/CONNECTION_ABORT/"):
self.send_response(200)
self.send_header("Content-Length", "99999")
self.end_headers()
self.wfile.write(b"partial")
self.wfile.close()
return

# Otherwise, return file from disk
return SimpleHTTPRequestHandler.do_GET(self, *args, **kwargs)

Expand Down
Loading