Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Cert failure handling #525

Merged
merged 4 commits into from Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions app/models/system-state.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,13 @@ import type { SystemState } from '@prisma/client';
*/

export function initialize() {
return prisma.systemState.create({
data: {
// Using an upsert here to make sure we only initialize if the unique row is missing
return prisma.systemState.upsert({
where: {
unique: StateEnumType.unique,
},
update: {},
create: {
unique: StateEnumType.unique,
reconciliationNeeded: true,
},
Expand Down
91 changes: 91 additions & 0 deletions app/queues/certificate/certificate-error-handler.server.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import { Queue, QueueEvents } from 'bullmq';
import logger from '~/lib/logger.server';
import { CertificateStatus } from '@prisma/client';
import { addNotification } from '../notifications/notifications.server';
import * as certificateModel from '~/models/certificate.server';
import * as challengeModel from '~/models/challenge.server';
import { setIsReconciliationNeeded } from '~/models/system-state.server';
import { dnsCleanerQueueName } from './dns-cleaner-worker.server';

import { redis } from '~/lib/redis.server';

import type { CertificateJobData } from './certificateJobTypes.server';
import type { QueueEventsListener } from 'bullmq';

declare global {
var __cert_flow_error_handlers_init__: boolean;
}

/**
* When one step in the queue fails, it fails all subsequent steps as well.
* Because of this, we can attach a listener of type `failed` to the last one
* to do the cleanup
*
*
* Notes:
* Worker.error('failed', () => {}) seems to be never called
* Worker.on('failed', () => {}) is called on failures that are retried too
* FlowProducer.on('error', () => {}) does not give back job id/data
* For this reason, I have to use QueueEvents.
*
* To get the job data back from the QueueEvent, I have to treat the last
* step of the flow as the separate queue that it is, and instantiate a
* Queue object (only connecting to the redis queue, does nothing else)
*/

export const initCertificateErrorHandler = () => {
if (process.env.NODE_ENV !== 'production' && global.__cert_flow_error_handlers_init__) {
// Only do this setup once if in dev
return;
}

global.__cert_flow_error_handlers_init__ = true;

const dnsCleanerQueue = new Queue<CertificateJobData>(dnsCleanerQueueName, { connection: redis });
const dnsCleanerQueueEvents = new QueueEvents(dnsCleanerQueueName, { connection: redis });

const errorHandler: QueueEventsListener['failed'] = async ({ jobId }) => {
try {
const job = await dnsCleanerQueue.getJob(jobId);
const { rootDomain, certificateId } = job?.data ?? {};

if (!certificateId) {
return;
}

logger.info('Certificate flow failed. Running cleanup', { rootDomain, certificateId });

await certificateModel.updateCertificateById(certificateId, {
status: CertificateStatus.failed,
});

await challengeModel.deleteChallengesByCertificateId(certificateId);

/**
* No need to manually delete the dns records as
* the database relation is set to cascade - challenge delete
* will remove the row from the dnsRecords table as well
*
* After those records were cascade deleted, we can trigger
* the reconciler as well
*/

await setIsReconciliationNeeded(true);

const certificateEntry = await certificateModel.getCertificateById(certificateId);

logger.debug('Sending failure notification email', { rootDomain, certificateId });
await addNotification({
emailAddress: certificateEntry.user.email,
subject: 'My.Custom.Domain certificate request failed',
message: `${certificateEntry.user.displayName}, your certificate request with domain: ${certificateEntry.domain} has failed. Log in to My.Custom.Domain to try again.`,
});

logger.info('Cleanup completed on failed certificate flow', { rootDomain, certificateId });
} catch (e) {
logger.error('Certificate cleanup encountered an error', e);
}
};

dnsCleanerQueueEvents.on('failed', errorHandler);
};
8 changes: 6 additions & 2 deletions app/queues/certificate/certificate-flow.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import { redis } from '~/lib/redis.server';
import type { FlowJob } from 'bullmq';
import type { CertificateJobData } from './certificateJobTypes.server';
import { isUserDeactivated } from '~/models/user.server';
import { initCertificateErrorHandler } from './certificate-error-handler.server';

// Exporting these to allow for graceful shutdown
export {
Expand All @@ -36,6 +37,9 @@ const JOB_REMOVAL_INTERVAL_S = 7 * 24 * 60 * 60; // 7 days

const flowProducer = new FlowProducer({ connection: redis });

// Initialize error handling before any flows are dispatched
initCertificateErrorHandler();

export const addCertRequest = async ({ rootDomain, username }: AddCertRequest) => {
// Don't do anything is user is deactivated
if (await isUserDeactivated(username)) {
Expand Down Expand Up @@ -126,8 +130,8 @@ export const addCertRequest = async ({ rootDomain, username }: AddCertRequest) =
data: jobData,
children: [challengeCompleter],
opts: {
failParentOnFailure: false, // Important, don't wait the cleanup step
attempts: 3,
failParentOnFailure: true,
attempts: 5,
backoff: {
type: 'exponential',
delay: 60_000, // start with 1 minute, double each time
Expand Down
2 changes: 1 addition & 1 deletion app/queues/certificate/order-completer-worker.server.ts
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ export const orderCompleterWorker = new Worker<CertificateJobData>(
await addNotification({
emailAddress: certificateEntry.user.email,
subject: 'My.Custom.Domain certificate ready',
message: `${certificateEntry.username}, your certificate with domain: ${certificateEntry.domain} is ready. Log in to My.Custom.Domain to view/manage it.`,
message: `${certificateEntry.user.displayName}, your certificate with domain: ${certificateEntry.domain} is ready. Log in to My.Custom.Domain to view/manage it.`,
});
},
{ connection: redis }
Expand Down
4 changes: 2 additions & 2 deletions test/unit/system-state.server.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,13 +22,13 @@ describe('initialize()', () => {
expect(state.reconciliationNeeded).toBe(true);
});

test('only one system state can exist', async () => {
test('only one system state can exist, but initialize should not fail if it does', async () => {
await expect(
prisma.systemState.create({
data: {},
})
).rejects.toThrow();
await expect(initialize()).rejects.toThrow();
await expect(initialize()).resolves.not.toThrow();
});
});

Expand Down