Skip to content

Commit 6872905

Browse files
Merge branch 'master' into 2110-download-file-as-document
2 parents 211ecd6 + 0043272 commit 6872905

File tree

94 files changed

+2461
-1380
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

94 files changed

+2461
-1380
lines changed

.github/workflows/dev-build.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ concurrency:
66

77
on:
88
push:
9-
branches: ['3999-chromium-flags'] # put your current branch to create a build. Core team only.
9+
branches: ['agentic-streaming'] # put your current branch to create a build. Core team only.
1010
paths-ignore:
1111
- '**.md'
1212
- 'cloud-deployments/*'

collector/.env.example

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,6 @@
11
# Placeholder .env file for collector runtime
2+
3+
# This enables HTTP request/response logging in development. Set the value to any non-empty string to enable; leave the value empty or comment the line out to disable.
4+
# ENABLE_HTTP_LOGGER=""
5+
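# This enables timestamps for the HTTP logger. Set the value to any non-empty string to enable (the value is only checked for being non-empty, so "false" also enables it); leave it empty or comment the line out to disable.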
6+
# ENABLE_HTTP_LOGGER_TIMESTAMPS=""

collector/.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,6 @@ yarn-error.log
44
!yarn.lock
55
outputs
66
scripts
7+
.env.development
8+
.env.production
9+
.env.test
Lines changed: 27 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,16 +1,32 @@
11
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
22

33
describe("YoutubeTranscript", () => {
4-
it("should fetch transcript from YouTube video", async () => {
5-
const videoId = "BJjsfNO5JTo";
6-
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
7-
lang: "en",
8-
});
4+
if (process.env.GITHUB_ACTIONS) {
5+
console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve.");
6+
it('is stubbed in GitHub Actions', () => expect(true).toBe(true));
7+
} else {
8+
it("should fetch transcript from YouTube video", async () => {
9+
const videoId = "BJjsfNO5JTo";
10+
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
11+
lang: "en",
12+
});
913

10-
expect(transcript).toBeDefined();
11-
expect(typeof transcript).toBe("string");
12-
expect(transcript.length).toBeGreaterThan(0);
13-
// console.log("Success! Transcript length:", transcript.length);
14-
// console.log("First 200 characters:", transcript.substring(0, 200) + "...");
15-
}, 30000);
14+
expect(transcript).toBeDefined();
15+
expect(typeof transcript).toBe("string");
16+
expect(transcript.length).toBeGreaterThan(0);
17+
console.log("First 200 characters:", transcript.substring(0, 200) + "...");
18+
}, 30000);
19+
20+
it("should fetch non asr transcript from YouTube video", async () => {
21+
const videoId = "D111ao6wWH0";
22+
const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
23+
lang: "zh-HK",
24+
});
25+
26+
expect(transcript).toBeDefined();
27+
expect(typeof transcript).toBe("string");
28+
expect(transcript.length).toBeGreaterThan(0);
29+
console.log("First 200 characters:", transcript.substring(0, 200) + "...");
30+
}, 30000);
31+
}
1632
});

collector/index.js

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,9 +15,21 @@ const { wipeCollectorStorage } = require("./utils/files");
1515
const extensions = require("./extensions");
1616
const { processRawText } = require("./processRawText");
1717
const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
18+
const { httpLogger } = require("./middleware/httpLogger");
1819
const app = express();
1920
const FILE_LIMIT = "3GB";
2021

22+
// Only log HTTP requests in development mode and only if the ENABLE_HTTP_LOGGER environment variable is set to a non-empty value
23+
if (
24+
process.env.NODE_ENV === "development" &&
25+
!!process.env.ENABLE_HTTP_LOGGER
26+
) {
27+
app.use(
28+
httpLogger({
29+
enableTimestamps: !!process.env.ENABLE_HTTP_LOGGER_TIMESTAMPS,
30+
})
31+
);
32+
}
2133
app.use(cors({ origin: true }));
2234
app.use(
2335
bodyParser.text({ limit: FILE_LIMIT }),

collector/middleware/httpLogger.js

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
const httpLogger =
2+
({ enableTimestamps = false }) =>
3+
(req, res, next) => {
4+
// Capture the original res.end to log response status
5+
const originalEnd = res.end;
6+
7+
res.end = function (chunk, encoding) {
8+
// Log the request method, status code, and path
9+
const statusColor = res.statusCode >= 400 ? "\x1b[31m" : "\x1b[32m"; // Red for errors, green for success
10+
console.log(
11+
`\x1b[32m[HTTP]\x1b[0m ${statusColor}${res.statusCode}\x1b[0m ${
12+
req.method
13+
} -> ${req.path} ${
14+
enableTimestamps
15+
? `@ ${new Date().toLocaleTimeString("en-US", { hour12: true })}`
16+
: ""
17+
}`.trim()
18+
);
19+
20+
// Call the original end method
21+
return originalEnd.call(this, chunk, encoding);
22+
};
23+
24+
next();
25+
};
26+
27+
module.exports = {
28+
httpLogger,
29+
};

collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js

Lines changed: 88 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,85 @@ class YoutubeTranscript {
8585
.replace(/\s+/g, " ");
8686
}
8787

88+
/**
89+
* Calculates a preference score for a caption track to determine the best match
90+
* @param {Object} track - The caption track object from YouTube
91+
* @param {string} track.languageCode - ISO language code (e.g., 'zh-HK', 'en', 'es')
92+
* @param {string} track.kind - Track type ('asr' for auto-generated, "" for human-transcribed)
93+
* @param {string[]} preferredLanguages - Array of language codes in preference order (e.g., ['zh-HK', 'en'])
94+
* @returns {number} Preference score (lower is better)
95+
*/
96+
static #calculatePreferenceScore(track, preferredLanguages) {
97+
// Language preference: index in preferredLanguages array (0 = most preferred)
98+
const languagePreference = preferredLanguages.indexOf(track.languageCode);
99+
const languageScore = languagePreference === -1 ? 9999 : languagePreference;
100+
101+
// Kind penalty: add 0.5 to auto-generated ('asr') tracks so a non-'asr' (human-transcribed) track of the same language scores lower (better)
102+
const kindBonus = track.kind === "asr" ? 0.5 : 0;
103+
104+
return languageScore + kindBonus;
105+
}
106+
107+
/**
108+
* Finds the most suitable caption track based on preferred languages
109+
* @param {string} videoBody - The raw HTML response from YouTube
110+
* @param {string[]} preferredLanguages - Array of language codes in preference order
111+
* @returns {Object|null} The selected caption track or null if none found
112+
*/
113+
static #findPreferredCaptionTrack(videoBody, preferredLanguages) {
114+
const captionsConfigJson = videoBody.match(
115+
/"captions":(.*?),"videoDetails":/s
116+
);
117+
118+
const captionsConfig = captionsConfigJson?.[1]
119+
? JSON.parse(captionsConfigJson[1])
120+
: null;
121+
122+
const captionTracks = captionsConfig
123+
? captionsConfig.playerCaptionsTracklistRenderer.captionTracks
124+
: null;
125+
126+
if (!captionTracks || captionTracks.length === 0) {
127+
return null;
128+
}
129+
130+
const sortedTracks = [...captionTracks].sort((a, b) => {
131+
const scoreA = this.#calculatePreferenceScore(a, preferredLanguages);
132+
const scoreB = this.#calculatePreferenceScore(b, preferredLanguages);
133+
return scoreA - scoreB;
134+
});
135+
136+
return sortedTracks[0];
137+
}
138+
139+
/**
140+
* Fetches video page content and finds the preferred caption track
141+
* @param {string} videoId - YouTube video ID
142+
* @param {string[]} preferredLanguages - Array of preferred language codes
143+
* @returns {Promise<Object>} The preferred caption track
144+
* @throws {YoutubeTranscriptError} If no suitable caption track is found
145+
*/
146+
static async #getPreferredCaptionTrack(videoId, preferredLanguages) {
147+
const videoResponse = await fetch(
148+
`https://www.youtube.com/watch?v=${videoId}`,
149+
{ credentials: "omit" }
150+
);
151+
const videoBody = await videoResponse.text();
152+
153+
const preferredCaptionTrack = this.#findPreferredCaptionTrack(
154+
videoBody,
155+
preferredLanguages
156+
);
157+
158+
if (!preferredCaptionTrack) {
159+
throw new YoutubeTranscriptError(
160+
"No suitable caption track found for the video"
161+
);
162+
}
163+
164+
return preferredCaptionTrack;
165+
}
166+
88167
/**
89168
* Fetch transcript from YouTube video
90169
* @param {string} videoId - Video URL or video identifier
@@ -93,14 +172,20 @@ class YoutubeTranscript {
93172
* @returns {Promise<string>} Video transcript text
94173
*/
95174
static async fetchTranscript(videoId, config = {}) {
175+
const preferredLanguages = config?.lang ? [config?.lang, "en"] : ["en"];
96176
const identifier = this.retrieveVideoId(videoId);
97-
const lang = config?.lang ?? "en";
98177

99178
try {
179+
const preferredCaptionTrack = await this.#getPreferredCaptionTrack(
180+
identifier,
181+
preferredLanguages
182+
);
183+
100184
const innerProto = this.#getBase64Protobuf({
101-
param1: "asr",
102-
param2: lang,
185+
param1: preferredCaptionTrack.kind || "",
186+
param2: preferredCaptionTrack.languageCode,
103187
});
188+
104189
const params = this.#getBase64Protobuf({
105190
param1: identifier,
106191
param2: innerProto,

docker/.env.example

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ GID='1000'
4444
# OLLAMA_MODEL_PREF='llama2'
4545
# OLLAMA_MODEL_TOKEN_LIMIT=4096
4646
# OLLAMA_AUTH_TOKEN='your-ollama-auth-token-here (optional, only for ollama running behind auth - Bearer token)'
47+
# OLLAMA_RESPONSE_TIMEOUT=7200000 (optional, max timeout in milliseconds for ollama response to conclude. Default is 5min before aborting)
4748

4849
# LLM_PROVIDER='togetherai'
4950
# TOGETHER_AI_API_KEY='my-together-ai-key'

frontend/src/components/LLMSelection/DPAISOptions/index.jsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -92,7 +92,7 @@ export default function DellProAIStudioOptions({
9292
type="url"
9393
name="DellProAiStudioBasePath"
9494
className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
95-
placeholder="http://localhost:8553/v1"
95+
placeholder="http://localhost:8553/v1/openai"
9696
value={basePathValue.value}
9797
required={true}
9898
autoComplete="off"

frontend/src/components/Sidebar/SearchBox/index.jsx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ export default function SearchBox({ user, showNewWsModal }) {
6161
onChange={handleSearch}
6262
onReset={handleReset}
6363
onFocus={(e) => e.target.select()}
64-
className="border-none w-full h-full rounded-lg bg-theme-sidebar-item-default pl-4 pr-1 placeholder:text-theme-settings-input-placeholder placeholder:pl-4 outline-none text-white search-input peer text-sm"
64+
className="border-none w-full h-full rounded-lg bg-theme-sidebar-item-default pl-9 focus:pl-4 pr-1 placeholder:text-theme-settings-input-placeholder outline-none text-white search-input peer text-sm"
6565
/>
6666
<MagnifyingGlass
6767
size={14}

0 commit comments

Comments
 (0)