|
| 1 | +package warc |
| 2 | + |
| 3 | +import ( |
| 4 | + "io" |
| 5 | + "os" |
| 6 | + "strconv" |
| 7 | + "testing" |
| 8 | +) |
| 9 | + |
| 10 | +// TestSmokeWARCFormatRegression validates that the WARC format remains consistent |
| 11 | +// by checking a frozen reference file (testdata/test.warc.gz) against known-good values. |
| 12 | +// |
| 13 | +// This test serves as a regression detector for WARC format changes, complementing the |
| 14 | +// dynamic tests in client_test.go. It addresses the concern that byte-level format |
| 15 | +// changes should be explicitly validated against a known-good snapshot. |
| 16 | +// |
| 17 | +// If this test fails, it indicates that either: |
| 18 | +// 1. The WARC writing logic has changed in a way that affects the format |
| 19 | +// 2. The reference file has been modified |
| 20 | +// 3. There's a bug in the record serialization |
| 21 | +func TestSmokeWARCFormatRegression(t *testing.T) { |
| 22 | + const testFile = "testdata/test.warc.gz" |
| 23 | + |
| 24 | + // Expected file-level metrics |
| 25 | + const expectedFileSize = 22350 // bytes (compressed) |
| 26 | + const expectedTotalRecords = 3 |
| 27 | + const expectedTotalContentLength = 22083 // sum of all Content-Length values |
| 28 | + |
| 29 | + // Expected record-level metrics |
| 30 | + // These values were extracted from a known-good WARC file and serve as |
| 31 | + // a snapshot of correct format behavior. |
| 32 | + expectedRecords := []struct { |
| 33 | + warcType string |
| 34 | + contentLength int64 |
| 35 | + blockDigest string |
| 36 | + payloadDigest string // only for response records |
| 37 | + targetURI string // only for response records |
| 38 | + }{ |
| 39 | + { |
| 40 | + warcType: "warcinfo", |
| 41 | + contentLength: 143, |
| 42 | + blockDigest: "sha1:IYWIATZSPEOF7U5W7VGGJOSQTIWUDXQ6", |
| 43 | + }, |
| 44 | + { |
| 45 | + warcType: "request", |
| 46 | + contentLength: 110, |
| 47 | + blockDigest: "sha1:JNDMG56JVTVVOQSDQRD25XWTGMRQAQDB", |
| 48 | + }, |
| 49 | + { |
| 50 | + warcType: "response", |
| 51 | + contentLength: 21830, |
| 52 | + blockDigest: "sha1:LCKC4TTRSBWYHGYT5P22ON4DWY65WHDZ", |
| 53 | + targetURI: "https://apis.google.com/js/platform.js", |
| 54 | + }, |
| 55 | + } |
| 56 | + |
| 57 | + // Validate file size |
| 58 | + stat, err := os.Stat(testFile) |
| 59 | + if err != nil { |
| 60 | + t.Fatalf("failed to stat test file: %v", err) |
| 61 | + } |
| 62 | + if stat.Size() != expectedFileSize { |
| 63 | + t.Errorf("file size mismatch: expected %d bytes, got %d bytes", expectedFileSize, stat.Size()) |
| 64 | + } |
| 65 | + |
| 66 | + // Open and read WARC file |
| 67 | + file, err := os.Open(testFile) |
| 68 | + if err != nil { |
| 69 | + t.Fatalf("failed to open test file: %v", err) |
| 70 | + } |
| 71 | + defer file.Close() |
| 72 | + |
| 73 | + reader, err := NewReader(file) |
| 74 | + if err != nil { |
| 75 | + t.Fatalf("failed to create WARC reader: %v", err) |
| 76 | + } |
| 77 | + |
| 78 | + var recordCount int |
| 79 | + var totalContentLength int64 |
| 80 | + |
| 81 | + // Read and validate each record |
| 82 | + for recordCount < expectedTotalRecords { |
| 83 | + record, _, err := reader.ReadRecord() |
| 84 | + if err != nil { |
| 85 | + if err == io.EOF { |
| 86 | + break |
| 87 | + } |
| 88 | + t.Fatalf("failed to read record %d: %v", recordCount+1, err) |
| 89 | + } |
| 90 | + if record == nil { |
| 91 | + break |
| 92 | + } |
| 93 | + |
| 94 | + expected := expectedRecords[recordCount] |
| 95 | + |
| 96 | + // Validate WARC-Type |
| 97 | + warcType := record.Header.Get("WARC-Type") |
| 98 | + if warcType != expected.warcType { |
| 99 | + t.Errorf("record %d: WARC-Type mismatch: expected %q, got %q", |
| 100 | + recordCount+1, expected.warcType, warcType) |
| 101 | + } |
| 102 | + |
| 103 | + // Validate Content-Length |
| 104 | + contentLengthStr := record.Header.Get("Content-Length") |
| 105 | + contentLength, err := strconv.ParseInt(contentLengthStr, 10, 64) |
| 106 | + if err != nil { |
| 107 | + t.Errorf("record %d: failed to parse Content-Length %q: %v", |
| 108 | + recordCount+1, contentLengthStr, err) |
| 109 | + } else { |
| 110 | + if contentLength != expected.contentLength { |
| 111 | + t.Errorf("record %d: Content-Length mismatch: expected %d, got %d", |
| 112 | + recordCount+1, expected.contentLength, contentLength) |
| 113 | + } |
| 114 | + totalContentLength += contentLength |
| 115 | + } |
| 116 | + |
| 117 | + // Validate WARC-Block-Digest |
| 118 | + blockDigest := record.Header.Get("WARC-Block-Digest") |
| 119 | + if blockDigest != expected.blockDigest { |
| 120 | + t.Errorf("record %d: WARC-Block-Digest mismatch: expected %q, got %q", |
| 121 | + recordCount+1, expected.blockDigest, blockDigest) |
| 122 | + } |
| 123 | + |
| 124 | + // Validate response-specific fields |
| 125 | + if warcType == "response" { |
| 126 | + if expected.targetURI != "" { |
| 127 | + targetURI := record.Header.Get("WARC-Target-URI") |
| 128 | + if targetURI != expected.targetURI { |
| 129 | + t.Errorf("record %d: WARC-Target-URI mismatch: expected %q, got %q", |
| 130 | + recordCount+1, expected.targetURI, targetURI) |
| 131 | + } |
| 132 | + } |
| 133 | + } |
| 134 | + |
| 135 | + // Close record content |
| 136 | + if err := record.Content.Close(); err != nil { |
| 137 | + t.Errorf("record %d: failed to close content: %v", recordCount+1, err) |
| 138 | + } |
| 139 | + |
| 140 | + recordCount++ |
| 141 | + } |
| 142 | + |
| 143 | + // Validate total record count |
| 144 | + if recordCount != expectedTotalRecords { |
| 145 | + t.Errorf("total record count mismatch: expected %d, got %d", |
| 146 | + expectedTotalRecords, recordCount) |
| 147 | + } |
| 148 | + |
| 149 | + // Validate total content length |
| 150 | + if totalContentLength != expectedTotalContentLength { |
| 151 | + t.Errorf("total content length mismatch: expected %d bytes, got %d bytes", |
| 152 | + expectedTotalContentLength, totalContentLength) |
| 153 | + } |
| 154 | +} |
0 commit comments