Skip to content

Commit 2dda0fb

Browse files
committed
Add WARC format regression smoke test
Introduce TestSmokeWARCFormatRegression to validate WARC format consistency using a frozen reference file (testdata/test.warc.gz). This test checks exact byte counts, record counts, Content-Length values, and digest hashes against known-good values. This complements the existing dynamic tests by providing explicit validation that the WARC format hasn't changed, addressing the concern about byte-level format regression detection while keeping the main integration tests maintainable.
1 parent ca9fd64 commit 2dda0fb

File tree

1 file changed

+154
-0
lines changed

1 file changed

+154
-0
lines changed

smoke_test.go

Lines changed: 154 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,154 @@
1+
package warc
2+
3+
import (
4+
"io"
5+
"os"
6+
"strconv"
7+
"testing"
8+
)
9+
10+
// TestSmokeWARCFormatRegression validates that the WARC format remains consistent
11+
// by checking a frozen reference file (testdata/test.warc.gz) against known-good values.
12+
//
13+
// This test serves as a regression detector for WARC format changes, complementing the
14+
// dynamic tests in client_test.go. It addresses the concern that byte-level format
15+
// changes should be explicitly validated against a known-good snapshot.
16+
//
17+
// If this test fails, it indicates that either:
18+
// 1. The WARC writing logic has changed in a way that affects the format
19+
// 2. The reference file has been modified
20+
// 3. There's a bug in the record serialization
21+
func TestSmokeWARCFormatRegression(t *testing.T) {
22+
const testFile = "testdata/test.warc.gz"
23+
24+
// Expected file-level metrics
25+
const expectedFileSize = 22350 // bytes (compressed)
26+
const expectedTotalRecords = 3
27+
const expectedTotalContentLength = 22083 // sum of all Content-Length values
28+
29+
// Expected record-level metrics
30+
// These values were extracted from a known-good WARC file and serve as
31+
// a snapshot of correct format behavior.
32+
expectedRecords := []struct {
33+
warcType string
34+
contentLength int64
35+
blockDigest string
36+
payloadDigest string // only for response records
37+
targetURI string // only for response records
38+
}{
39+
{
40+
warcType: "warcinfo",
41+
contentLength: 143,
42+
blockDigest: "sha1:IYWIATZSPEOF7U5W7VGGJOSQTIWUDXQ6",
43+
},
44+
{
45+
warcType: "request",
46+
contentLength: 110,
47+
blockDigest: "sha1:JNDMG56JVTVVOQSDQRD25XWTGMRQAQDB",
48+
},
49+
{
50+
warcType: "response",
51+
contentLength: 21830,
52+
blockDigest: "sha1:LCKC4TTRSBWYHGYT5P22ON4DWY65WHDZ",
53+
targetURI: "https://apis.google.com/js/platform.js",
54+
},
55+
}
56+
57+
// Validate file size
58+
stat, err := os.Stat(testFile)
59+
if err != nil {
60+
t.Fatalf("failed to stat test file: %v", err)
61+
}
62+
if stat.Size() != expectedFileSize {
63+
t.Errorf("file size mismatch: expected %d bytes, got %d bytes", expectedFileSize, stat.Size())
64+
}
65+
66+
// Open and read WARC file
67+
file, err := os.Open(testFile)
68+
if err != nil {
69+
t.Fatalf("failed to open test file: %v", err)
70+
}
71+
defer file.Close()
72+
73+
reader, err := NewReader(file)
74+
if err != nil {
75+
t.Fatalf("failed to create WARC reader: %v", err)
76+
}
77+
78+
var recordCount int
79+
var totalContentLength int64
80+
81+
// Read and validate each record
82+
for recordCount < expectedTotalRecords {
83+
record, _, err := reader.ReadRecord()
84+
if err != nil {
85+
if err == io.EOF {
86+
break
87+
}
88+
t.Fatalf("failed to read record %d: %v", recordCount+1, err)
89+
}
90+
if record == nil {
91+
break
92+
}
93+
94+
expected := expectedRecords[recordCount]
95+
96+
// Validate WARC-Type
97+
warcType := record.Header.Get("WARC-Type")
98+
if warcType != expected.warcType {
99+
t.Errorf("record %d: WARC-Type mismatch: expected %q, got %q",
100+
recordCount+1, expected.warcType, warcType)
101+
}
102+
103+
// Validate Content-Length
104+
contentLengthStr := record.Header.Get("Content-Length")
105+
contentLength, err := strconv.ParseInt(contentLengthStr, 10, 64)
106+
if err != nil {
107+
t.Errorf("record %d: failed to parse Content-Length %q: %v",
108+
recordCount+1, contentLengthStr, err)
109+
} else {
110+
if contentLength != expected.contentLength {
111+
t.Errorf("record %d: Content-Length mismatch: expected %d, got %d",
112+
recordCount+1, expected.contentLength, contentLength)
113+
}
114+
totalContentLength += contentLength
115+
}
116+
117+
// Validate WARC-Block-Digest
118+
blockDigest := record.Header.Get("WARC-Block-Digest")
119+
if blockDigest != expected.blockDigest {
120+
t.Errorf("record %d: WARC-Block-Digest mismatch: expected %q, got %q",
121+
recordCount+1, expected.blockDigest, blockDigest)
122+
}
123+
124+
// Validate response-specific fields
125+
if warcType == "response" {
126+
if expected.targetURI != "" {
127+
targetURI := record.Header.Get("WARC-Target-URI")
128+
if targetURI != expected.targetURI {
129+
t.Errorf("record %d: WARC-Target-URI mismatch: expected %q, got %q",
130+
recordCount+1, expected.targetURI, targetURI)
131+
}
132+
}
133+
}
134+
135+
// Close record content
136+
if err := record.Content.Close(); err != nil {
137+
t.Errorf("record %d: failed to close content: %v", recordCount+1, err)
138+
}
139+
140+
recordCount++
141+
}
142+
143+
// Validate total record count
144+
if recordCount != expectedTotalRecords {
145+
t.Errorf("total record count mismatch: expected %d, got %d",
146+
expectedTotalRecords, recordCount)
147+
}
148+
149+
// Validate total content length
150+
if totalContentLength != expectedTotalContentLength {
151+
t.Errorf("total content length mismatch: expected %d bytes, got %d bytes",
152+
expectedTotalContentLength, totalContentLength)
153+
}
154+
}

0 commit comments

Comments
 (0)