Skip to content

Commit

Permalink
feat: add signal and timeoutMillis options for SitemapRequestList
Browse files Browse the repository at this point in the history
  • Loading branch information
barjin committed Jun 26, 2024
1 parent 9e4a620 commit b1498a8
Show file tree
Hide file tree
Showing 2 changed files with 105 additions and 3 deletions.
49 changes: 46 additions & 3 deletions packages/core/src/storages/sitemap_request_list.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,26 @@ import { purgeDefaultStorages } from './utils';
import { Request } from '../request';

export interface SitemapRequestListOptions {
/**
* List of sitemap URLs to parse.
*/
sitemapUrls: string[];
/**
* Proxy URL to be used for sitemap loading.
*/
proxyUrl?: string;
/**
* Key for persisting the state of the request list in the `KeyValueStore`.
*/
persistStateKey?: string;
/**
* Abort signal to be used for sitemap loading.
*/
signal?: AbortSignal;
/**
* Timeout for sitemap loading in milliseconds. If both `signal` and `timeoutMillis` are provided, the first one to finish will abort the loading.
*/
timeoutMillis?: number;
}

interface SitemapParsingProgress {
Expand All @@ -23,6 +40,7 @@ interface SitemapRequestListState {
urlQueue: string[];
reclaimed: string[];
sitemapParsingProgress: Record<keyof SitemapParsingProgress, any>;
abortLoading: boolean;
}

/**
Expand Down Expand Up @@ -70,6 +88,16 @@ export class SitemapRequestList implements IRequestList {
*/
private urlQueue: string[] = [];

/**
* Indicates whether the request list sitemap loading was aborted.
*
* If the loading was aborted before the sitemaps were fully loaded, the request list might be missing some URLs.
* The `isSitemapFullyLoaded` method can be used to check if the sitemaps were fully loaded.
*
* If the loading is aborted and all the requests are handled, `isFinished()` will return `true`.
*/
private abortLoading: boolean = false;

/** Number of URLs that were marked as handled */
private handledUrlCount = 0;

Expand All @@ -95,6 +123,8 @@ export class SitemapRequestList implements IRequestList {
sitemapUrls: ow.array.ofType(ow.string),
proxyUrl: ow.optional.string,
persistStateKey: ow.optional.string,
signal: ow.optional.any(),
timeoutMillis: ow.optional.number,
}),
);

Expand All @@ -117,7 +147,7 @@ export class SitemapRequestList implements IRequestList {
* Resolves once all the sitemaps URLs have been fully loaded (sets `isSitemapFullyLoaded` to `true`).
*/
private async load(): Promise<void> {
while (!this.isSitemapFullyLoaded()) {
while (!this.isSitemapFullyLoaded() && !this.abortLoading) {
const sitemapUrl =
this.sitemapParsingProgress.inProgressUrl ??
this.sitemapParsingProgress.pendingUrls.values().next().value;
Expand Down Expand Up @@ -159,6 +189,17 @@ export class SitemapRequestList implements IRequestList {
const requestList = new SitemapRequestList(options);
await requestList.restoreState();
void requestList.load();

options?.signal?.addEventListener('abort', () => {
requestList.abortLoading = true;
});

if (options.timeoutMillis) {
setTimeout(() => {
requestList.abortLoading = true;
}, options.timeoutMillis);
}

return requestList;
}

Expand All @@ -181,7 +222,7 @@ export class SitemapRequestList implements IRequestList {
this.urlQueue.length === 0 &&
this.inProgress.size === 0 &&
this.reclaimed.size === 0 &&
this.isSitemapFullyLoaded()
(this.isSitemapFullyLoaded() || this.abortLoading)
);
}

Expand Down Expand Up @@ -217,6 +258,7 @@ export class SitemapRequestList implements IRequestList {
},
urlQueue: this.urlQueue,
reclaimed: [...this.inProgress, ...this.reclaimed], // In-progress and reclaimed requests will be both retried if state is restored
abortLoading: this.abortLoading,
} satisfies SitemapRequestListState);
}

Expand All @@ -241,6 +283,7 @@ export class SitemapRequestList implements IRequestList {
inProgressEntries: new Set(state.sitemapParsingProgress.inProgressEntries),
};
this.urlQueue = state.urlQueue;
this.abortLoading = state.abortLoading;

this.queuedUrlsBySitemap.clear();
}
Expand Down Expand Up @@ -272,7 +315,7 @@ export class SitemapRequestList implements IRequestList {
* @inheritDoc
*/
async *waitForNextRequest() {
while (!this.isSitemapFullyLoaded() || this.urlQueue.length > 0) {
while ((!this.isSitemapFullyLoaded() && !this.abortLoading) || this.urlQueue.length > 0) {
const request = await this.fetchNextRequest();
if (request) {
yield request;
Expand Down
59 changes: 59 additions & 0 deletions test/core/sitemap_request_list.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,65 @@ describe('SitemapRequestList', () => {
expect(list.handledCount()).toBe(7);
});

test('aborting long sitemap load works', async () => {
const controller = new AbortController();

const list = await SitemapRequestList.open({
sitemapUrls: [`${url}/sitemap-index.xml`],
signal: controller.signal,
});

await sleep(50); // Loads the first sub-sitemap, but not the second
controller.abort();

for await (const request of list.waitForNextRequest()) {
await list.markRequestHandled(request);
}

expect(list.isFinished()).resolves.toBe(true);
expect(list.isSitemapFullyLoaded()).toBe(false);
expect(list.handledCount()).toBe(2);
});

test('timeout option works', async () => {
const list = await SitemapRequestList.open({
sitemapUrls: [`${url}/sitemap-index.xml`],
timeoutMillis: 50, // Loads the first sub-sitemap, but not the second
});

for await (const request of list.waitForNextRequest()) {
await list.markRequestHandled(request);
}

expect(list.isFinished()).resolves.toBe(true);
expect(list.isSitemapFullyLoaded()).toBe(false);
expect(list.handledCount()).toBe(2);
});

test('resurrection does not resume aborted loading', async () => {
const options = {
sitemapUrls: [`${url}/sitemap-index.xml`],
persistStateKey: 'resurrection-abort',
timeoutMillis: 50,
};

{
const list = await SitemapRequestList.open(options);

await sleep(50);

expect(list.isEmpty()).resolves.toBe(false);
await list.persistState();
}

const newList = await SitemapRequestList.open(options);
for await (const request of newList.waitForNextRequest()) {
await newList.markRequestHandled(request);
}

expect(newList.handledCount()).toBe(2);
});

test('processing the whole list', async () => {
const list = await SitemapRequestList.open({ sitemapUrls: [`${url}/sitemap.xml`] });
const requests: Request[] = [];
Expand Down

0 comments on commit b1498a8

Please sign in to comment.