Skip to content

Commit

Permalink
loading: multi-wacz loading fix (#194)
Browse files Browse the repository at this point in the history
* loading: when loading from multi-WACZ files, attempt to load from all WACZ files to find the best match
* if no WACZ ID is present, check all WACZ files first, then check the DB (switches the loading order)
  • Loading branch information
ikreymer authored Jul 16, 2024
1 parent 8187f5e commit 519b47b
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 15 deletions.
2 changes: 1 addition & 1 deletion dist/sw.js

Large diffs are not rendered by default.

44 changes: 30 additions & 14 deletions src/wacz/multiwacz.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { OnDemandPayloadArchiveDB } from "../remotearchivedb.js";
import { SingleRecordWARCLoader } from "../warcloader.js";
import { CDXLoader, CDX_COOKIE } from "../cdxloader.js";
import { AccessDeniedError, digestMessage, handleAuthNeeded, tsToDate } from "../utils.js";
import { AccessDeniedError, digestMessage, handleAuthNeeded, tsToDate, getTS } from "../utils.js";
import { getSurt } from "warcio";
import { LiveProxy } from "../liveproxy.js";

Expand Down Expand Up @@ -535,18 +535,20 @@ export class MultiWACZ extends OnDemandPayloadArchiveDB// implements WACZLoadSou

async lookupUrl(url, datetime, opts = {}) {
try {
let result = await super.lookupUrl(url, datetime, opts);

if (result && (!opts.noRevisits || result.mime !== "warc/revisit")) {
return result;
}

const { waczname } = opts;

let result;

if (waczname && waczname !== NO_LOAD_WACZ) {
result = await this.lookupUrlForWACZ(waczname, url, datetime, opts);
}

if (result && (!opts.noRevisits || result.mime !== "warc/revisit")) {
return result;
}

result = await super.lookupUrl(url, datetime, opts);

return result;
} catch (e) {
console.warn(e);
Expand Down Expand Up @@ -775,7 +777,7 @@ export class MultiWACZ extends OnDemandPayloadArchiveDB// implements WACZLoadSou
}
}

let foundHash = null;
let foundMap = new Map();

for (const [name, file] of Object.entries(this.waczfiles)) {
if (file.fileType !== WACZ_LEAF) {
Expand All @@ -787,16 +789,30 @@ export class MultiWACZ extends OnDemandPayloadArchiveDB// implements WACZLoadSou
continue;
}

resp = await super.getResource(request, prefix, event, {waczname: name, noFuzzyCheck: true});
resp = await super.getResource(request, prefix, event, {waczname: name, noFuzzyCheck: true, loadFirst: true});
if (resp) {
waczname = name;
foundHash = file.hash;
break;
foundMap.set(resp.date, {name, hash: file.hash});
}
}

if (waczname) {
return Response.redirect(`${prefix}:${foundHash}/${request.timestamp}mp_/${request.url}`);
if (foundMap.size > 0) {
const requestTS = tsToDate(request.timestamp);
let min = -1;
let ts;
let foundHash;

for (const date of foundMap.keys()) {
const dist = Math.abs(date.getTime() - requestTS);
if (min < 0 || dist < min) {
const {name, hash} = foundMap.get(date);
waczname = name;
foundHash = hash;
ts = getTS(date.toISOString());
min = dist;
}
}

return Response.redirect(`${prefix}:${foundHash}/${ts}mp_/${request.url}`);
}

if (this.fuzzyUrlRules.length) {
Expand Down

0 comments on commit 519b47b

Please sign in to comment.