Mintplex-Labs
diff --git a/‎.github/workflows/dev-build.yaml‎
Lines changed: 1 addition & 1 deletion b/‎.github/workflows/dev-build.yaml‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎collector/.env.example‎
Lines changed: 5 additions & 0 deletions b/‎collector/.env.example‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎collector/.gitignore‎
Lines changed: 3 additions & 0 deletions b/‎collector/.gitignore‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js‎
Lines changed: 27 additions & 11 deletions b/‎collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js‎
Lines changed: 27 additions & 11 deletions
diff --git a/‎collector/index.js‎
Lines changed: 12 additions & 0 deletions b/‎collector/index.js‎
Lines changed: 12 additions & 0 deletions
diff --git a/‎collector/middleware/httpLogger.js‎
Lines changed: 29 additions & 0 deletions b/‎collector/middleware/httpLogger.js‎
Lines changed: 29 additions & 0 deletions
diff --git a/‎collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js‎
Lines changed: 88 additions & 3 deletions b/‎collector/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js‎
Lines changed: 88 additions & 3 deletions
diff --git a/‎docker/.env.example‎
Lines changed: 1 addition & 0 deletions b/‎docker/.env.example‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎frontend/src/components/LLMSelection/DPAISOptions/index.jsx‎
Lines changed: 1 addition & 1 deletion b/‎frontend/src/components/LLMSelection/DPAISOptions/index.jsx‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎frontend/src/components/Sidebar/SearchBox/index.jsx‎
Lines changed: 1 addition & 1 deletion b/‎frontend/src/components/Sidebar/SearchBox/index.jsx‎
Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@ concurrency:
 
 on:
   push:
-    branches: ['3999-chromium-flags'] # put your current branch to create a build. Core team only.
+    branches: ['agentic-streaming'] # put your current branch to create a build. Core team only.
     paths-ignore:
       - '**.md'
       - 'cloud-deployments/*'
 
@@ -1 +1,6 @@
 # Placeholder .env file for collector runtime
+
+# This enables HTTP request/response logging in development. Set value to truthy string to enable, leave empty value or comment out to disable
+# ENABLE_HTTP_LOGGER=""
+# This enables timestamps for the HTTP Logger. Set value to any non-empty string to enable (even "false" enables it), leave empty or comment out to disable
+# ENABLE_HTTP_LOGGER_TIMESTAMPS=""
@@ -4,3 +4,6 @@ yarn-error.log
 !yarn.lock
 outputs
 scripts
+.env.development
+.env.production
+.env.test
@@ -1,16 +1,32 @@
 const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
 
 describe("YoutubeTranscript", () => {
-  it("should fetch transcript from YouTube video", async () => {
-    const videoId = "BJjsfNO5JTo";
-    const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
-      lang: "en",
-    });
+  if (process.env.GITHUB_ACTIONS) {
+    console.log("Skipping YoutubeTranscript test in GitHub Actions as the URLs will not resolve.");
+    it('is stubbed in GitHub Actions', () => expect(true).toBe(true));
+  } else {
+    it("should fetch transcript from YouTube video", async () => {
+      const videoId = "BJjsfNO5JTo";
+      const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
+        lang: "en",
+      });
 
-    expect(transcript).toBeDefined();
-    expect(typeof transcript).toBe("string");
-    expect(transcript.length).toBeGreaterThan(0);
-    // console.log("Success! Transcript length:", transcript.length);
-    // console.log("First 200 characters:", transcript.substring(0, 200) + "...");
-  }, 30000);
+      expect(transcript).toBeDefined();
+      expect(typeof transcript).toBe("string");
+      expect(transcript.length).toBeGreaterThan(0);
+      console.log("First 200 characters:", transcript.substring(0, 200) + "...");
+    }, 30000);
+
+    it("should fetch non asr transcript from YouTube video", async () => {
+      const videoId = "D111ao6wWH0";
+      const transcript = await YoutubeTranscript.fetchTranscript(videoId, {
+        lang: "zh-HK",
+      });
+
+      expect(transcript).toBeDefined();
+      expect(typeof transcript).toBe("string");
+      expect(transcript.length).toBeGreaterThan(0);
+      console.log("First 200 characters:", transcript.substring(0, 200) + "...");
+    }, 30000);
+  }
 });
@@ -15,9 +15,21 @@ const { wipeCollectorStorage } = require("./utils/files");
 const extensions = require("./extensions");
 const { processRawText } = require("./processRawText");
 const { verifyPayloadIntegrity } = require("./middleware/verifyIntegrity");
+const { httpLogger } = require("./middleware/httpLogger");
 const app = express();
 const FILE_LIMIT = "3GB";
 
+// Only log HTTP requests in development mode and if the ENABLE_HTTP_LOGGER environment variable is set to a non-empty value
+if (
+  process.env.NODE_ENV === "development" &&
+  !!process.env.ENABLE_HTTP_LOGGER
+) {
+  app.use(
+    httpLogger({
+      enableTimestamps: !!process.env.ENABLE_HTTP_LOGGER_TIMESTAMPS,
+    })
+  );
+}
 app.use(cors({ origin: true }));
 app.use(
   bodyParser.text({ limit: FILE_LIMIT }),
 
@@ -0,0 +1,29 @@
+const httpLogger =
+  ({ enableTimestamps = false } = {}) =>
+  (req, res, next) => {
+    // Wrap res.end so the status code can be logged at the moment the response is ended
+    const originalEnd = res.end;
+
+    res.end = function (...args) {
+      // Log the request method, status code, and path
+      const statusColor = res.statusCode >= 400 ? "\x1b[31m" : "\x1b[32m"; // Red for errors, green for success
+      console.log(
+        `\x1b[32m[HTTP]\x1b[0m ${statusColor}${res.statusCode}\x1b[0m ${
+          req.method
+        } -> ${req.path} ${
+          enableTimestamps
+            ? `@ ${new Date().toLocaleTimeString("en-US", { hour12: true })}`
+            : ""
+        }`.trim()
+      );
+
+      // Forward every argument (chunk, encoding, callback) so a completion callback is never dropped
+      return originalEnd.apply(this, args);
+    };
+
+    next();
+  };
+
+module.exports = {
+  httpLogger,
+};
@@ -85,6 +85,85 @@ class YoutubeTranscript {
       .replace(/\s+/g, " ");
   }
 
+  /**
+   * Calculates a preference score for a caption track to determine the best match
+   * @param {Object} track - The caption track object from YouTube
+   * @param {string} track.languageCode - ISO language code (e.g., 'zh-HK', 'en', 'es')
+   * @param {string} track.kind - Track type ('asr' for auto-generated, "" for human-transcribed)
+   * @param {string[]} preferredLanguages - Array of language codes in preference order (e.g., ['zh-HK', 'en'])
+   * @returns {number} Preference score (lower is better)
+   */
+  static #calculatePreferenceScore(track, preferredLanguages) {
+    // Language preference: index in preferredLanguages array (0 = most preferred)
+    const languagePreference = preferredLanguages.indexOf(track.languageCode);
+    const languageScore = languagePreference === -1 ? 9999 : languagePreference;
+
+    // Kind penalty (lower score is better): human-transcribed tracks (empty/undefined kind) add 0, auto-generated ('asr') tracks add 0.5
+    const kindBonus = track.kind === "asr" ? 0.5 : 0;
+
+    return languageScore + kindBonus;
+  }
+
+  /**
+   * Finds the most suitable caption track based on preferred languages
+   * @param {string} videoBody - The raw HTML response from YouTube
+   * @param {string[]} preferredLanguages - Array of language codes in preference order
+   * @returns {Object|null} The selected caption track or null if none found
+   */
+  static #findPreferredCaptionTrack(videoBody, preferredLanguages) {
+    const captionsConfigJson = videoBody.match(
+      /"captions":(.*?),"videoDetails":/s
+    );
+
+    const captionsConfig = captionsConfigJson?.[1]
+      ? JSON.parse(captionsConfigJson[1])
+      : null;
+
+    // A captions object without a tracklist renderer means "no tracks" (null), not a TypeError
+    const captionTracks =
+      captionsConfig?.playerCaptionsTracklistRenderer?.captionTracks ?? null;
+
+    if (!captionTracks || captionTracks.length === 0) {
+      return null;
+    }
+
+    const sortedTracks = [...captionTracks].sort((a, b) => {
+      const scoreA = this.#calculatePreferenceScore(a, preferredLanguages);
+      const scoreB = this.#calculatePreferenceScore(b, preferredLanguages);
+      return scoreA - scoreB;
+    });
+
+    return sortedTracks[0];
+  }
+
+  /**
+   * Fetches video page content and finds the preferred caption track
+   * @param {string} videoId - YouTube video ID
+   * @param {string[]} preferredLanguages - Array of preferred language codes
+   * @returns {Promise<Object>} The preferred caption track
+   * @throws {YoutubeTranscriptError} If no suitable caption track is found
+   */
+  static async #getPreferredCaptionTrack(videoId, preferredLanguages) {
+    const videoResponse = await fetch(
+      `https://www.youtube.com/watch?v=${videoId}`,
+      { credentials: "omit" }
+    );
+    const videoBody = await videoResponse.text();
+
+    const preferredCaptionTrack = this.#findPreferredCaptionTrack(
+      videoBody,
+      preferredLanguages
+    );
+
+    if (!preferredCaptionTrack) {
+      throw new YoutubeTranscriptError(
+        "No suitable caption track found for the video"
+      );
+    }
+
+    return preferredCaptionTrack;
+  }
+
   /**
    * Fetch transcript from YouTube video
    * @param {string} videoId - Video URL or video identifier
@@ -93,14 +172,20 @@ class YoutubeTranscript {
    * @returns {Promise<string>} Video transcript text
    */
   static async fetchTranscript(videoId, config = {}) {
+    const preferredLanguages = config?.lang ? [config?.lang, "en"] : ["en"];
     const identifier = this.retrieveVideoId(videoId);
-    const lang = config?.lang ?? "en";
 
     try {
+      const preferredCaptionTrack = await this.#getPreferredCaptionTrack(
+        identifier,
+        preferredLanguages
+      );
+
       const innerProto = this.#getBase64Protobuf({
-        param1: "asr",
-        param2: lang,
+        param1: preferredCaptionTrack.kind || "",
+        param2: preferredCaptionTrack.languageCode,
       });
+
       const params = this.#getBase64Protobuf({
         param1: identifier,
         param2: innerProto,
 
@@ -44,6 +44,7 @@ GID='1000'
 # OLLAMA_MODEL_PREF='llama2'
 # OLLAMA_MODEL_TOKEN_LIMIT=4096
 # OLLAMA_AUTH_TOKEN='your-ollama-auth-token-here (optional, only for ollama running behind auth - Bearer token)'
+# OLLAMA_RESPONSE_TIMEOUT=7200000 (optional, max timeout in milliseconds for ollama response to conclude. Default is 5min before aborting)
 
 # LLM_PROVIDER='togetherai'
 # TOGETHER_AI_API_KEY='my-together-ai-key'
 
@@ -92,7 +92,7 @@ export default function DellProAIStudioOptions({
               type="url"
               name="DellProAiStudioBasePath"
               className="border-none bg-theme-settings-input-bg text-white placeholder:text-theme-settings-input-placeholder text-sm rounded-lg focus:outline-primary-button active:outline-primary-button outline-none block w-full p-2.5"
-              placeholder="http://localhost:8553/v1"
+              placeholder="http://localhost:8553/v1/openai"
               value={basePathValue.value}
               required={true}
               autoComplete="off"
 
@@ -61,7 +61,7 @@ export default function SearchBox({ user, showNewWsModal }) {
           onChange={handleSearch}
           onReset={handleReset}
           onFocus={(e) => e.target.select()}
-          className="border-none w-full h-full rounded-lg bg-theme-sidebar-item-default pl-4 pr-1 placeholder:text-theme-settings-input-placeholder placeholder:pl-4 outline-none text-white search-input peer text-sm"
+          className="border-none w-full h-full rounded-lg bg-theme-sidebar-item-default pl-9 focus:pl-4 pr-1 placeholder:text-theme-settings-input-placeholder outline-none text-white search-input peer text-sm"
         />
         <MagnifyingGlass
           size={14}