Skip to content

Commit 234f8ea

Browse files
authored
chore: start Xetification (#215)
* chore: start Xetification * chore: need to fuzz Xet headers * chore: comment * chore: fmt * chore: move to repo I control * chore: simplify --------- Co-authored-by: FL33TW00D <[email protected]>
1 parent 010a298 commit 234f8ea

File tree

2 files changed

+108
-5
lines changed

2 files changed

+108
-5
lines changed

Sources/Hub/HubApi.swift

Lines changed: 96 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,53 @@ import Foundation
1010
import Network
1111
import os
1212

13+
/// Names of the HTTP headers (and one `Link` relation key) read from Hub responses.
///
/// See https://datatracker.ietf.org/doc/html/rfc7540#section-8.1.2
/// `requests` in Python leaves headers in their original casing,
/// whereas Swift strictly adheres to RFC 7540 and can force lower case.
/// This is relevant for Xet.
enum HFHttpHeaders {
    // MARK: Standard HTTP headers

    static let contentLength: String = "Content-Length"
    static let etag: String = "Etag"
    static let location: String = "Location"

    // MARK: Repository / linked-file headers

    static let linkedEtag: String = "X-Linked-Etag"
    static let linkedSize: String = "X-Linked-Size"
    static let repoCommit: String = "X-Repo-Commit"

    // MARK: Xet

    static let xetHash: String = "x-xet-hash"
    static let xetRefreshRoute: String = "X-Xet-Refresh-Route"

    /// Not a header name: this is the `rel` value looked up inside the `Link` header.
    static let linkXetAuthKey: String = "xet-auth"
}
28+
29+
/// Xet-related values extracted from a file-metadata response.
///
/// The type is public and is exposed through public API, so its fields are public
/// as well; otherwise clients outside the module could obtain a value but not read it.
public struct XetFileData: Sendable {
    /// Value of the Xet hash response header for the file.
    public let fileHash: String

    /// Refresh route taken from the `Link` header (`xet-auth` relation) or, failing that,
    /// from the Xet refresh-route response header.
    public let refreshRoute: String
}
33+
34+
/// `requests` automatically parses `Link` headers into `response.links`;
/// this extension implements similar functionality.
extension HTTPURLResponse {
    /// Returns the target URL of the first `Link` header entry that carries the given relation.
    ///
    /// - Parameter rel: The relation type to look for (for example `"next"`).
    ///   Relation types are compared case-insensitively and must match a whole
    ///   relation name, not just a prefix of one.
    /// - Returns: The text between `<` and `>` of the first matching entry,
    ///   or `nil` when there is no `Link` header or no entry has that relation.
    func getLinkURL(for rel: String) -> String? {
        /// Splits a `Link` header value into its entries at commas that are neither
        /// inside a `<...>` target nor inside a quoted parameter value.
        func splitTopLevel(_ header: String) -> [String] {
            var pieces: [String] = []
            var current = ""
            var insideTarget = false
            var insideQuotes = false

            for character in header {
                switch character {
                case "<" where !insideQuotes:
                    insideTarget = true
                case ">" where !insideQuotes:
                    insideTarget = false
                case "\"" where !insideTarget:
                    insideQuotes.toggle()
                case "," where !insideTarget && !insideQuotes:
                    pieces.append(current)
                    current = ""
                    continue
                default:
                    break
                }
                current.append(character)
            }

            pieces.append(current)
            return pieces
        }

        // Header names may arrive lower-cased (HTTP/2), so use the case-insensitive
        // accessor rather than an exact-key lookup in `allHeaderFields`.
        guard let linkHeader = value(forHTTPHeaderField: "Link") else {
            return nil
        }

        let wantedRelation = rel.lowercased()

        for linkValue in splitTopLevel(linkHeader) {
            let trimmed = linkValue.trimmingCharacters(in: .whitespaces)

            // Each entry has the form `<target>; param=value; ...`.
            guard trimmed.hasPrefix("<"), let close = trimmed.firstIndex(of: ">") else {
                continue
            }

            let target = trimmed[trimmed.index(after: trimmed.startIndex)..<close]
            let parameters = trimmed[trimmed.index(after: close)...]

            for parameter in parameters.split(separator: ";") {
                let pieces = parameter.split(separator: "=", maxSplits: 1)
                guard pieces.count == 2,
                      pieces[0].trimmingCharacters(in: .whitespaces).lowercased() == "rel"
                else {
                    continue
                }

                // A quoted `rel` may list several space-separated relation types.
                let relations = pieces[1]
                    .trimmingCharacters(in: .whitespaces)
                    .trimmingCharacters(in: CharacterSet(charactersIn: "\""))
                    .lowercased()
                    .split(separator: " ")

                if relations.contains(where: { $0 == wantedRelation }) {
                    return String(target)
                }
            }
        }

        return nil
    }
}
59+
1360
public struct HubApi: Sendable {
1461
var downloadBase: URL
1562
var hfToken: String?
@@ -24,7 +71,7 @@ public struct HubApi: Sendable {
2471
public init(
2572
downloadBase: URL? = nil,
2673
hfToken: String? = nil,
27-
endpoint: String = "https://huggingface.co",
74+
endpoint: String? = nil,
2875
useBackgroundSession: Bool = false,
2976
useOfflineMode: Bool? = nil
3077
) {
@@ -35,7 +82,7 @@ public struct HubApi: Sendable {
3582
let documents = FileManager.default.urls(for: .documentDirectory, in: .userDomainMask).first!
3683
self.downloadBase = documents.appending(component: "huggingface")
3784
}
38-
self.endpoint = endpoint
85+
self.endpoint = endpoint ?? Self.hfEndpointfromEnv()
3986
self.useBackgroundSession = useBackgroundSession
4087
self.useOfflineMode = useOfflineMode
4188
NetworkMonitor.shared.startMonitoring()
@@ -50,6 +97,10 @@ public struct HubApi: Sendable {
5097
}
5198

5299
private extension HubApi {
100+
/// Resolves the Hub endpoint from the `HF_ENDPOINT` environment variable.
///
/// - Returns: The value of `HF_ENDPOINT` with surrounding whitespace and trailing
///   slashes removed, or `"https://huggingface.co"` when the variable is unset or
///   effectively empty.
static func hfEndpointfromEnv() -> String {
    let fallback = "https://huggingface.co"

    guard
        let raw = ProcessInfo.processInfo.environment["HF_ENDPOINT"]?
            .trimmingCharacters(in: .whitespacesAndNewlines),
        !raw.isEmpty
    else {
        return fallback
    }

    // Drop trailing slashes so that appending "/path" never produces "//path".
    var endpoint = raw
    while endpoint.hasSuffix("/") {
        endpoint.removeLast()
    }

    // A value made only of slashes is treated like an unset variable.
    return endpoint.isEmpty ? fallback : endpoint
}
103+
53104
static func hfTokenFromEnv() -> String? {
54105
let possibleTokens = [
55106
{ ProcessInfo.processInfo.environment["HF_TOKEN"] },
@@ -573,6 +624,9 @@ public extension HubApi {
573624

574625
/// Size of the file. In case of an LFS file, contains the size of the actual LFS file, not the pointer.
575626
public let size: Int?
627+
628+
/// Xet file data, if available. Contains the file hash and the refresh route.
629+
public let xetFileData: XetFileData?
576630
}
577631

578632
/// Metadata about a file in the local directory related to a download process
@@ -601,12 +655,49 @@ public extension HubApi {
601655
let location = response.statusCode == 302 ? response.value(forHTTPHeaderField: "Location") : response.url?.absoluteString
602656

603657
return FileMetadata(
604-
commitHash: response.value(forHTTPHeaderField: "X-Repo-Commit"),
658+
commitHash: response.value(forHTTPHeaderField: HFHttpHeaders.repoCommit),
605659
etag: normalizeEtag(
606-
(response.value(forHTTPHeaderField: "X-Linked-Etag")) ?? (response.value(forHTTPHeaderField: "Etag"))
660+
(response.value(forHTTPHeaderField: HFHttpHeaders.linkedEtag)) ?? (response.value(forHTTPHeaderField: HFHttpHeaders.etag))
607661
),
608662
location: location ?? url.absoluteString,
609-
size: Int(response.value(forHTTPHeaderField: "X-Linked-Size") ?? response.value(forHTTPHeaderField: "Content-Length") ?? "")
663+
size: Int(response.value(forHTTPHeaderField: HFHttpHeaders.linkedSize) ?? response.value(forHTTPHeaderField: HFHttpHeaders.contentLength) ?? ""),
664+
xetFileData: parseXetFileDataFromResponse(response: response, endpoint: endpoint)
665+
)
666+
}
667+
668+
/// Builds `XetFileData` from the headers of a file-metadata response.
///
/// Modeled on
/// https://github.com/huggingface/huggingface_hub/blob/b698915d6b582c72806ac3e91c43bfd8dde35856/src/huggingface_hub/utils/_xet.py#L29
///
/// - Parameters:
///   - response: The HTTP response to inspect; `nil` yields `nil`.
///   - endpoint: Endpoint that should replace a leading `https://huggingface.co`
///     in the refresh route. Defaults to `https://huggingface.co` when `nil`.
/// - Returns: The Xet hash and refresh route, or `nil` when the response has no
///   Xet hash header or no refresh route (neither a `Link` entry with the
///   `xet-auth` relation nor the refresh-route header).
private func parseXetFileDataFromResponse(
    response: HTTPURLResponse?,
    endpoint: String? = nil
) -> XetFileData? {
    // `value(forHTTPHeaderField:)` is case-insensitive, so the lookup works whether
    // or not the header names were lower-cased in transit.
    guard let response,
          let fileHash = response.value(forHTTPHeaderField: HFHttpHeaders.xetHash)
    else {
        return nil
    }

    // Prefer the `Link` header entry; fall back to the dedicated header.
    guard var refreshRoute = response.getLinkURL(for: HFHttpHeaders.linkXetAuthKey)
        ?? response.value(forHTTPHeaderField: HFHttpHeaders.xetRefreshRoute)
    else {
        return nil
    }

    let defaultEndpoint = "https://huggingface.co"
    let slashes = CharacterSet(charactersIn: "/")
    let targetEndpoint = (endpoint ?? defaultEndpoint).trimmingCharacters(in: slashes)

    // Rewrite only a leading default endpoint, and only when it is the whole origin:
    // a host such as "https://huggingface.co.example" must be left untouched, as must
    // any later occurrence of the default endpoint inside the path or query.
    if refreshRoute == defaultEndpoint || refreshRoute.hasPrefix(defaultEndpoint + "/") {
        refreshRoute = targetEndpoint + String(refreshRoute.dropFirst(defaultEndpoint.count))
    }

    return XetFileData(
        fileHash: fileHash,
        refreshRoute: refreshRoute
    )
}
612703

Tests/HubTests/HubApiTests.swift

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,18 @@ class HubApiTests: XCTestCase {
103103
}
104104
}
105105

106+
/// Fetches metadata for a file in a Xet test repository and checks that the
/// Xet data is present and carries the expected file hash.
/// Note: this test performs a real request against huggingface.co.
func testGetXetMetadata() async throws {
    do {
        // Unwrap instead of force-unwrapping so a bad URL fails the test rather than crashing the run.
        let url = try XCTUnwrap(URL(string: "https://huggingface.co/FL33TW00D-HF/xet-test/resolve/main/tokenizer.json"))
        let metadata = try await Hub.getFileMetadata(fileURL: url)

        XCTAssertNotNil(metadata.xetFileData)
        XCTAssertEqual(metadata.xetFileData?.fileHash, "6aec39639a0a2d1ca966356b8c2b8426a484f80ff80731f44fa8482040713bdf")
    } catch {
        XCTFail("\(error)")
    }
}
117+
106118
func testGetFileMetadataBlobPath() async throws {
107119
do {
108120
let url = URL(string: "https://huggingface.co/enterprise-explorers/Llama-2-7b-chat-coreml/resolve/main/config.json")

0 commit comments

Comments
 (0)