Skip to content

Commit

Permalink
Support setting functions to be run before a new WARC file is created…
Browse files Browse the repository at this point in the history
… and after the file is completed
  • Loading branch information
johnerikhalse committed Feb 14, 2024
1 parent e1e3529 commit d80c8cf
Showing 1 changed file with 33 additions and 3 deletions.
36 changes: 33 additions & 3 deletions warcfile.go
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,11 @@ func (w *singleWarcFileWriter) createFile() error {
if path != "" && !strings.HasSuffix(path, "/") {
path += "/"
}

if w.opts.beforeFileCreationHook != nil {
_ = w.opts.beforeFileCreationHook(path + fileName)
}

path += fileName + w.opts.openFileSuffix

file, err := os.OpenFile(path, os.O_CREATE|os.O_EXCL|os.O_RDWR, 0666)
Expand Down Expand Up @@ -460,9 +465,14 @@ func (w *singleWarcFileWriter) close() error {
if err := f.Close(); err != nil {
return fmt.Errorf("failed to close file: %s: %w", f.Name(), err)
}
if err := fileutil.Rename(f.Name(), strings.TrimSuffix(f.Name(), w.opts.openFileSuffix)); err != nil {
finalFileName := strings.TrimSuffix(f.Name(), w.opts.openFileSuffix)
if err := fileutil.Rename(f.Name(), finalFileName); err != nil {
return fmt.Errorf("failed to rename file: %s: %w", f.Name(), err)
}

if w.opts.afterFileCreationHook != nil {
_ = w.opts.afterFileCreationHook(finalFileName, w.currentFileSize, w.currentWarcInfoId)
}
}
return nil
}
Expand Down Expand Up @@ -590,6 +600,8 @@ type warcFileWriterOptions struct {
warcInfoFunc func(recordBuilder WarcRecordBuilder) error
addConcurrentHeader bool
flush bool
beforeFileCreationHook func(fileName string) error
afterFileCreationHook func(fileName string, size int64, warcInfoId string) error
recordOptions []WarcRecordOption
}

Expand Down Expand Up @@ -724,7 +736,7 @@ func WithMarshaler(marshaler Marshaler) WarcFileWriterOption {
})
}

// WithMaxConcurrentWriters sets the maximum number of Warc files that can be written to simultaneously.
// WithMaxConcurrentWriters sets the maximum number of Warc files that can be written simultaneously.
//
// defaults to one
func WithMaxConcurrentWriters(count int) WarcFileWriterOption {
Expand All @@ -737,7 +749,7 @@ func WithMaxConcurrentWriters(count int) WarcFileWriterOption {
//
// This value is used to decide if a record will fit into a Warcfile's MaxFileSize when using compression
// since it's not possible to know this before the record is written. If the value is far from the actual size reduction,
// a under- or overfilled file might be the result.
// an under- or overfilled file might be the result.
//
// defaults to .5 (half the uncompressed size)
func WithExpectedCompressionRatio(ratio float64) WarcFileWriterOption {
Expand Down Expand Up @@ -781,3 +793,21 @@ func WithRecordOptions(opts ...WarcRecordOption) WarcFileWriterOption {
o.recordOptions = opts
})
}

// WithBeforeFileCreationHook registers f as the hook the writer invokes right
// before it opens a new WARC file.
//
// f is given the directory path joined with the generated file name, without
// the temporary open-file suffix that is appended while the file is being written.
// The error returned by f is currently discarded by the writer, so returning a
// non-nil error does not stop the file from being created.
// If no hook is set (or f is nil), nothing is called.
func WithBeforeFileCreationHook(f func(fileName string) error) WarcFileWriterOption {
	apply := func(opts *warcFileWriterOptions) {
		opts.beforeFileCreationHook = f
	}
	return newFuncWarcFileOption(apply)
}

// WithAfterFileCreationHook registers f as the hook the writer invokes once a
// WARC file is completed, i.e. after the file has been closed and renamed to
// its final name (the open-file suffix removed).
//
// f is given the final file name, the size the writer has recorded for the file
// and the writer's current WARC-Warcinfo-ID. The hook is not invoked when closing
// or renaming the file fails. The error returned by f is currently discarded by
// the writer.
// If no hook is set (or f is nil), nothing is called.
func WithAfterFileCreationHook(f func(fileName string, size int64, warcInfoId string) error) WarcFileWriterOption {
	apply := func(opts *warcFileWriterOptions) {
		opts.afterFileCreationHook = f
	}
	return newFuncWarcFileOption(apply)
}

0 comments on commit d80c8cf

Please sign in to comment.