Skip to content

Commit e6819ea

Browse files
Merge branch 'master' into 4521-feat-use-system-prompt-for-agent
2 parents 72132b8 + 985527c commit e6819ea

File tree

15 files changed

+393
-140
lines changed

15 files changed

+393
-140
lines changed

collector/__tests__/utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.test.js

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
process.env.STORAGE_DIR = "test-storage"; // needed for tests to run
12
const { YoutubeTranscript } = require("../../../../../utils/extensions/YoutubeTranscript/YoutubeLoader/youtube-transcript.js");
23

34
describe("YoutubeTranscript", () => {

collector/__tests__/utils/url/index.test.js

Lines changed: 69 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
1-
const { validURL, validateURL } = require("../../../utils/url");
1+
process.env.STORAGE_DIR = "test-storage"; // needed for tests to run
2+
const { validURL, validateURL, validYoutubeVideoUrl } = require("../../../utils/url");
23

34
// Mock the RuntimeSettings module
45
jest.mock("../../../utils/runtimeSettings", () => {
@@ -127,3 +128,70 @@ describe("validateURL", () => {
127128
.toBe("https://example.com/PATH/To/Resource?q2=Value&q1=UPPER");
128129
});
129130
});
131+
132+
133+
describe("validYoutubeVideoUrl", () => {
134+
const ID = "dQw4w9WgXcQ"; // 11-char valid video id
135+
136+
it("returns true for youtube watch URLs with v param", () => {
137+
expect(validYoutubeVideoUrl(`https://www.youtube.com/watch?v=${ID}`)).toBe(
138+
true
139+
);
140+
expect(validYoutubeVideoUrl(`https://youtube.com/watch?v=${ID}&t=10s`)).toBe(
141+
true
142+
);
143+
expect(validYoutubeVideoUrl(`https://m.youtube.com/watch?v=${ID}`)).toBe(true);
144+
expect(validYoutubeVideoUrl(`youtube.com/watch?v=${ID}`)).toBe(true);
145+
});
146+
147+
it("returns true for youtu.be short URLs", () => {
148+
expect(validYoutubeVideoUrl(`https://youtu.be/${ID}`)).toBe(true);
149+
expect(validYoutubeVideoUrl(`https://youtu.be/${ID}?si=abc`)).toBe(true);
150+
// extra path segments after id should still validate the id component
151+
expect(validYoutubeVideoUrl(`https://youtu.be/${ID}/extra`)).toBe(true);
152+
});
153+
154+
it("returns true for embed and v path formats", () => {
155+
expect(validYoutubeVideoUrl(`https://www.youtube.com/embed/${ID}`)).toBe(true);
156+
expect(validYoutubeVideoUrl(`https://youtube.com/v/${ID}`)).toBe(true);
157+
});
158+
159+
it("returns false for non-YouTube hosts", () => {
160+
expect(validYoutubeVideoUrl("https://example.com/watch?v=dQw4w9WgXcQ")).toBe(
161+
false
162+
);
163+
expect(validYoutubeVideoUrl("https://vimeo.com/123456")).toBe(false);
164+
});
165+
166+
it("returns false for unrelated YouTube paths without a video id", () => {
167+
expect(validYoutubeVideoUrl("https://www.youtube.com/user/somechannel")).toBe(
168+
false
169+
);
170+
expect(validYoutubeVideoUrl("https://www.youtube.com/")).toBe(false);
171+
});
172+
173+
it("returns false for empty or bad inputs", () => {
174+
expect(validYoutubeVideoUrl("")).toBe(false);
175+
expect(validYoutubeVideoUrl(null)).toBe(false);
176+
expect(validYoutubeVideoUrl(undefined)).toBe(false);
177+
});
178+
179+
it("returns the video ID for valid YouTube video URLs", () => {
180+
expect(validYoutubeVideoUrl(`https://www.youtube.com/watch?v=${ID}`, true)).toBe(ID);
181+
expect(validYoutubeVideoUrl(`https://youtube.com/watch?v=${ID}&t=10s`, true)).toBe(ID);
182+
expect(validYoutubeVideoUrl(`https://m.youtube.com/watch?v=${ID}`, true)).toBe(ID);
183+
expect(validYoutubeVideoUrl(`youtube.com/watch?v=${ID}`, true)).toBe(ID);
184+
expect(validYoutubeVideoUrl(`https://youtu.be/${ID}`, true)).toBe(ID);
185+
expect(validYoutubeVideoUrl(`https://youtu.be/${ID}?si=abc`, true)).toBe(ID);
186+
expect(validYoutubeVideoUrl(`https://youtu.be/${ID}/extra`, true)).toBe(ID);
187+
expect(validYoutubeVideoUrl(`https://www.youtube.com/embed/${ID}`, true)).toBe(ID);
188+
expect(validYoutubeVideoUrl(`https://youtube.com/v/${ID}`, true)).toBe(ID);
189+
// invalid video IDs
190+
expect(validYoutubeVideoUrl(`https://www.youtube.com/watch?v=invalid`, true)).toBe(null);
191+
expect(validYoutubeVideoUrl(`https://youtube.com/watch?v=invalid`, true)).toBe(null);
192+
expect(validYoutubeVideoUrl(`https://m.youtube.com/watch?v=invalid`, true)).toBe(null);
193+
expect(validYoutubeVideoUrl(`youtube.com/watch`, true)).toBe(null);
194+
expect(validYoutubeVideoUrl(`https://youtu.be/invalid`, true)).toBe(null);
195+
expect(validYoutubeVideoUrl(`https://youtu.be/invalid?si=abc`, true)).toBe(null);
196+
});
197+
});

collector/processLink/convert/generic.js

Lines changed: 23 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,18 @@
11
const { v4 } = require("uuid");
2-
const path = require("path");
32
const {
43
PuppeteerWebBaseLoader,
54
} = require("langchain/document_loaders/web/puppeteer");
65
const { writeToServerDocuments } = require("../../utils/files");
76
const { tokenizeString } = require("../../utils/tokenizer");
87
const { default: slugify } = require("slugify");
9-
const { getContentTypeFromURL, returnResult } = require("../helpers");
10-
const { processSingleFile } = require("../../processSingleFile");
11-
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
12-
const { ACCEPTED_MIMES } = require("../../utils/constants");
8+
const {
9+
returnResult,
10+
determineContentType,
11+
processAsFile,
12+
} = require("../helpers");
13+
const {
14+
loadYouTubeTranscript,
15+
} = require("../../utils/extensions/YoutubeTranscript");
1316
const RuntimeSettings = require("../../utils/runtimeSettings");
1417

1518
/**
@@ -29,88 +32,30 @@ async function scrapeGenericUrl({
2932
metadata = {},
3033
saveAsDocument = true,
3134
}) {
32-
/** @type {'web' | 'file'} */
33-
let processVia = "web";
35+
/** @type {'web' | 'file' | 'youtube'} */
3436
console.log(`-- Working URL ${link} => (captureAs: ${captureAs}) --`);
35-
36-
const contentType = await getContentTypeFromURL(link)
37-
.then((result) => {
38-
// If there is a reason, log it, but continue with the process
39-
if (!!result.reason) console.error(result.reason);
40-
return result.contentType;
41-
})
42-
.catch((error) => {
43-
console.error("Error getting content type from URL", error);
44-
return null;
45-
});
46-
47-
// If the content is unlikely to be a webpage, assume it is a file and process it as a file
48-
if (
49-
!["text/html", "text/plain"].includes(contentType) &&
50-
contentType in ACCEPTED_MIMES
51-
)
52-
processVia = "file";
53-
37+
let { contentType, processVia } = await determineContentType(link);
5438
console.log(`-- URL determined to be ${contentType} (${processVia}) --`);
55-
// If the content type is a file, download the file to the hotdir and process it
56-
// Then return the content of the file as a document or whatever the captureAs dictates.
57-
if (processVia === "file") {
58-
const fileContentResult = await downloadURIToFile(link);
59-
if (!fileContentResult.success)
60-
return returnResult({
61-
success: false,
62-
reason: fileContentResult.reason,
63-
documents: [],
64-
content: null,
65-
saveAsDocument,
66-
});
67-
68-
const fileFilePath = fileContentResult.fileLocation;
69-
const targetFilename = path.basename(fileFilePath);
70-
71-
/**
72-
* If the saveAsDocument is false, we are only interested in the text content
73-
* and can ignore the file as a document by using `parseOnly` in the options.
74-
* This will send the file to the Direct Uploads folder instead of the Documents folder.
75-
* that will be deleted by the cleanup-orphan-documents job that runs frequently. The trade off
76-
* is that since it still is in FS we can debug its output or even potentially reuse it for other purposes.
77-
*
78-
* TODO: Improve this process via a new option that will instantly delete the file after processing
79-
* if we find we dont need this file ever after processing.
80-
*/
81-
const processSingleFileResult = await processSingleFile(targetFilename, {
82-
parseOnly: saveAsDocument === false,
83-
});
84-
if (!processSingleFileResult.success) {
85-
return returnResult({
86-
success: false,
87-
reason: processSingleFileResult.reason,
88-
documents: [],
89-
content: null,
90-
saveAsDocument,
91-
});
92-
}
93-
94-
// If we intend to return only the text content, return the content from the file
95-
// and then delete the file - otherwise it will be saved as a document
96-
if (!saveAsDocument) {
97-
return returnResult({
98-
success: true,
99-
content: processSingleFileResult.documents[0].pageContent,
100-
saveAsDocument,
101-
});
102-
}
10339

104-
return processSingleFileResult;
105-
}
40+
/**
41+
* When the content is a file or a YouTube video, we can use the existing processing functions
42+
* These are self-contained and will return the correct response based on the saveAsDocument flag already
43+
* so we can return the content immediately.
44+
*/
45+
if (processVia === "file")
46+
return await processAsFile({ uri: link, saveAsDocument });
47+
else if (processVia === "youtube")
48+
return await loadYouTubeTranscript(
49+
{ url: link },
50+
{ parseOnly: saveAsDocument === false }
51+
);
10652

10753
// Otherwise, assume the content is a webpage and scrape the content from the webpage
10854
const content = await getPageContent({
10955
link,
11056
captureAs,
11157
headers: scraperHeaders,
11258
});
113-
11459
if (!content || !content.length) {
11560
console.error(`Resulting URL content was empty at ${link}.`);
11661
return returnResult({
@@ -124,13 +69,12 @@ async function scrapeGenericUrl({
12469

12570
// If saveAsDocument is false, return the content as a string immediately
12671
// so that we don't save the content as a document
127-
if (!saveAsDocument) {
72+
if (!saveAsDocument)
12873
return returnResult({
12974
success: true,
13075
content,
13176
saveAsDocument,
13277
});
133-
}
13478

13579
// Save the content as a document from the URL
13680
const url = new URL(link);

collector/processLink/helpers/index.js

Lines changed: 107 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,9 @@
1+
const path = require("path");
12
const { validURL } = require("../../utils/url");
3+
const { processSingleFile } = require("../../processSingleFile");
4+
const { downloadURIToFile } = require("../../utils/downloadURIToFile");
5+
const { ACCEPTED_MIMES } = require("../../utils/constants");
6+
const { validYoutubeVideoUrl } = require("../../utils/url");
27

38
/**
49
* Get the content type of a resource
@@ -51,13 +56,23 @@ async function getContentTypeFromURL(url) {
5156
}
5257
}
5358

59+
/**
60+
* Normalize the result object based on the saveAsDocument flag
61+
* @param {Object} result - The result object to normalize
62+
* @param {boolean} result.success - Whether the result is successful
63+
* @param {string|null} result.reason - The reason for the result
64+
* @param {Object[]} result.documents - The documents from the result
65+
* @param {string|null} result.content - The content of the result
66+
* @param {boolean} result.saveAsDocument - Whether to save the content as a document. Default is true
67+
* @returns {{success: boolean, reason: string|null, documents: Object[], content: string|null}} - The normalized result object
68+
*/
5469
function returnResult({
5570
success,
5671
reason,
5772
documents,
5873
content,
5974
saveAsDocument = true,
60-
}) {
75+
} = {}) {
6176
if (!saveAsDocument) {
6277
return {
6378
success,
@@ -66,7 +81,98 @@ function returnResult({
6681
} else return { success, reason, documents };
6782
}
6883

84+
/**
85+
* Determine the content type of a link - should be a URL
86+
* @param {string} uri - The link to determine the content type of
87+
* @returns {Promise<{contentType: string|null, processVia: 'web' | 'file' | 'youtube'}>} - The content type of the link
88+
*/
89+
async function determineContentType(uri) {
90+
let processVia = "web";
91+
92+
// Don't check for the content type if it is a YouTube video URL
93+
if (validYoutubeVideoUrl(uri))
94+
return { contentType: "text/html", processVia: "youtube" };
95+
96+
return await getContentTypeFromURL(uri)
97+
.then((result) => {
98+
if (!!result.reason) console.error(result.reason);
99+
100+
// If the content type is not text/html or text/plain, and it is in the ACCEPTED_MIMES,
101+
// then we can process it as a file
102+
if (
103+
!!result.contentType &&
104+
!["text/html", "text/plain"].includes(result.contentType) &&
105+
result.contentType in ACCEPTED_MIMES
106+
)
107+
processVia = "file";
108+
109+
return { contentType: result.contentType, processVia };
110+
})
111+
.catch((error) => {
112+
console.error("Error getting content type from URL", error);
113+
return { contentType: null, processVia };
114+
});
115+
}
116+
117+
/**
118+
* Process a link as a file
119+
* @param {string} uri - The link to process as a file
120+
* @param {boolean} saveAsDocument - Whether to save the content as a document. Default is true
121+
* @returns {Promise<{success: boolean, reason: string|null, documents: Object[], content: string|null, saveAsDocument: boolean}>} - The content of the file
122+
*/
123+
async function processAsFile({ uri, saveAsDocument = true }) {
124+
const fileContentResult = await downloadURIToFile(uri);
125+
if (!fileContentResult.success)
126+
return returnResult({
127+
success: false,
128+
reason: fileContentResult.reason,
129+
documents: [],
130+
content: null,
131+
saveAsDocument,
132+
});
133+
134+
const fileFilePath = fileContentResult.fileLocation;
135+
const targetFilename = path.basename(fileFilePath);
136+
137+
/**
138+
* If the saveAsDocument is false, we are only interested in the text content
139+
* and can ignore the file as a document by using `parseOnly` in the options.
140+
* This will send the file to the Direct Uploads folder instead of the Documents folder.
141+
* There it will be deleted by the cleanup-orphan-documents job that runs frequently. The trade-off
142+
* is that since it still is in FS we can debug its output or even potentially reuse it for other purposes.
143+
*
144+
* TODO: Improve this process via a new option that will instantly delete the file after processing
145+
* if we find we never need this file again after processing.
146+
*/
147+
const processSingleFileResult = await processSingleFile(targetFilename, {
148+
parseOnly: saveAsDocument === false,
149+
});
150+
if (!processSingleFileResult.success) {
151+
return returnResult({
152+
success: false,
153+
reason: processSingleFileResult.reason,
154+
documents: [],
155+
content: null,
156+
saveAsDocument,
157+
});
158+
}
159+
160+
// If we intend to return only the text content, return the content from the file
161+
// and then delete the file - otherwise it will be saved as a document
162+
if (!saveAsDocument) {
163+
return returnResult({
164+
success: true,
165+
content: processSingleFileResult.documents[0].pageContent,
166+
saveAsDocument,
167+
});
168+
}
169+
170+
return processSingleFileResult;
171+
}
172+
69173
module.exports = {
70174
returnResult,
71175
getContentTypeFromURL,
176+
determineContentType,
177+
processAsFile,
72178
};

collector/utils/extensions/YoutubeTranscript/YoutubeLoader/index.js

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
const { validYoutubeVideoUrl } = require("../../../url");
2+
13
/*
24
* This is just a custom implementation of the Langchain JS YouTubeLoader class
35
* as the dependency for YoutubeTranscript is quite fickle and it's a rat race to keep it up
@@ -23,14 +25,9 @@ class YoutubeLoader {
2325
* @returns The videoId of the YouTube video.
2426
*/
2527
static getVideoID(url) {
26-
const match = url.match(
27-
/.*(?:youtu.be\/|v\/|u\/\w\/|embed\/|watch\?v=)([^#&?]*).*/
28-
);
29-
if (match !== null && match[1].length === 11) {
30-
return match[1];
31-
} else {
32-
throw new Error("Failed to get youtube video id from the url");
33-
}
28+
const videoId = validYoutubeVideoUrl(url, true);
29+
if (videoId) return videoId;
30+
throw new Error("Failed to get youtube video id from the url");
3431
}
3532

3633
/**

0 commit comments

Comments
 (0)