diff --git a/packages/utils/src/internals/open_graph_parser.ts b/packages/utils/src/internals/open_graph_parser.ts
index ad6d8a371cf0..92da5c732f41 100644
--- a/packages/utils/src/internals/open_graph_parser.ts
+++ b/packages/utils/src/internals/open_graph_parser.ts
@@ -2,13 +2,489 @@ import type { Dictionary } from '@crawlee/types';
import type { CheerioAPI } from 'cheerio';
import { load } from 'cheerio';
+// TODO: Finish specializing this module, removing generalization as you go.
+
+/**
+ * To turn your web pages into graph objects, you need to add basic metadata to your page. We've based the initial version
+ * of the protocol on RDFa which means that you'll place additional tags in the
of your web page. The four
+ * required properties for every page are:
+ *
+ * - `og:url` - The canonical URL of your object that will be used as its permanent ID in the graph, e.g., "https://www.imdb.com/title/tt0117500/".
+ * - `og:title` - The title of your object as it should appear within the graph, e.g., "The Rock".
+ * - `og:type` - The type of your object, e.g., "video.movie". Depending on the type you specify, other properties may also be required.
+ * - `og:image` - An image URL which should represent your object within the graph.
+ *
+ * See more at https://ogp.me/. Turtle specification available at https://ogp.me/ns/ogp.me.ttl.
+ */
+export interface OpenGraphMetadata {
+ /**
+ * `og:url` - The canonical URL of your object that will be used as its permanent ID in the graph, e.g., "https://www.imdb.com/title/tt0117500/".
+ */
+ url?: string;
+ /**
+ * `og:type` - The type of your object, e.g., "video.movie". Depending on the type you specify, other properties may also be required.
+ */
+ type?: string;
+ /**
+ * `og:title` - The title of your object as it should appear within the graph, e.g., "The Rock".
+ */
+ title?: string;
+ /**
+ * `og:locale` - The locale these tags are marked up in. Of the format language_TERRITORY. Default is en_US.
+ */
+ locale?: string;
+ /**
+ * `og:locale:alternate` - An array of other locales this page is available in.
+ **/
+ localeAlternate?: string[]; // because you can't subclass a string
+ /**
+ * `og:image` - An image URL which should represent your object within the graph.
+ */
+ image?: OpenGraphImageMetadata | string | OpenGraphImageMetadata[] | string[] | [OpenGraphImageMetadata | string];
+ /**
+ * `og:video` - A URL to a video file that complements this object.
+ **/
+ video?: OpenGraphVideoMetadata | string | OpenGraphVideoMetadata[] | string[] | [OpenGraphVideoMetadata | string];
+ /**
+ * `og:audio` - A URL to an audio file to accompany this object.
+ **/
+ audio?: OpenGraphAudioMetadata | string | OpenGraphAudioMetadata[] | string[] | [OpenGraphAudioMetadata | string];
+ /**
+ * `og:description` - A one to two sentence description of your object.
+ **/
+ description?: string;
+ /**
+ * `og:site_name` - If your object is part of a larger web site, the name which should be displayed for the overall site. e.g., "IMDb".
+ **/
+ siteName?: string;
+ /**
+ * `og:determiner` - The word that appears before this object's title in a sentence. An enum of (a, an, the, "", auto). If auto is chosen, the
+ * consumer of your data should chose between "a" or "an". Default is "" (blank).
+ **/
+ determiner?: string;
+ /**
+ * @deprecated `og:geo` - The latitude and longitude of a resource.
+ */
+ geo?: OpenGraphGeoMetadata;
+}
+
+/**
+ * The `OpenGraphImageMetadata` class has some optional structured properties.
+ */
+export interface OpenGraphImageMetadata {
+ /**
+ * `og:image` - An image URL which should represent your object within the graph.
+ */
+ url?: string;
+ /**
+ * `og:image:secure_url` - An alternate url to use if the webpage requires HTTPS.
+ */
+ secureUrl?: string;
+ /**
+ * `og:image:type` - A MIME type for this image.
+ */
+ type?: string;
+ /**
+ * `og:image:width` - The number of pixels wide.
+ */
+ width?: number;
+ /**
+ * `og:image:height` - The number of pixels high.
+ */
+ height?: number;
+ /**
+ * `og:image:alt` - A description of what is in the image (not a caption). If the page specifies an og:image it should specify `og:image:alt`.
+ */
+ alt?: string;
+}
+
+/**
+ * The `OpenGraphVideoMetadata` class has some optional structured properties.
+ */
+export interface OpenGraphVideoMetadata {
+ /**
+ * `og:video` - A relevant video URL for your object.
+ */
+ url?: string;
+ /**
+ * `og:video:secure_url` - A relevant, secure video URL for your object.
+ */
+ secureUrl?: string;
+ /**
+ * `og:video:type` - The mime type of a video e.g., "application/x-shockwave-flash".
+ */
+ type?: string;
+ /**
+ * `og:video:width` - The width of a video.
+ */
+ width?: number;
+ /**
+ * `og:video:height` - The height of a video.
+ */
+ height?: number;
+}
+
+/**
+ * The `OpenGraphAudioMetadata` class has some optional structured properties.
+ */
+export interface OpenGraphAudioMetadata {
+ /**
+ * `og:audio` - A relevant audio URL for your object.
+ */
+ url?: string;
+ /**
+ * `og:audio:secure_url` - A relevant, secure audio URL for your object.
+ */
+ secureUrl?: string;
+ /**
+ * `og:audio:type` - The mime type of an audio file e.g., "application/mp3".
+ */
+ type?: string;
+ /**
+ * @deprecated `og:audio:A title for some audio.
+ */
+ title?: string;
+ /**
+ * @deprecated An artist of some audio.
+ */
+ artist?: string;
+ /**
+ * @deprecated An album to which some audio belongs.
+ */
+ album?: string;
+}
+
+/**
+ * @deprecated The `OpenGraphGeoMetadata` class stores the latitude and longitude for a resource.
+ */
+export interface OpenGraphGeoMetadata {
+ /**
+ * @deprecated `og:geo:lat` - The latitude of the resource e.g., the latitude of a company.
+ */
+ latitude?: number;
+ /**
+ * @deprecated `og:geo:long` - The longitude of the resource e.g., the longitude of a company.
+ */
+ longitude?: number;
+}
+
export interface OpenGraphProperty {
name: string;
outputName: string;
children: OpenGraphProperty[];
+ // This may be useful in figuring out whether or not to treat it like an array (or an array of dictionaries)
+ // cardinality?: Number;
}
-type OpenGraphResult = string | string[] | Dictionary;
+export type OpenGraphResult = string | number | string[] | Dictionary;
+
+/**
+ * An OpenGraphParseHandler is triggered during parsing when the label and contents are read from the META tags.
+ * You can use a custom handler, or if you don't specify one it will by default copy the content read from the tag
+ * as a string. You may use a custom type.
+ */
+export type OpenGraphParseHandler = (content: string) => R;
+
+/**
+ * This is the default behavior for parsing the data. It copies the content to the return value.
+ * @param _label Unused in this implementation, the label for the Open Graph property.
+ * @param content The value for the Open Graph property.
+ * @returns The string value of the Open Graph content property.
+ */
+export function parseString(content: string): string {
+ // Copy the content to the return value.
+ return content;
+}
+
+/**
+ * This reads a number from content and returns the number.
+ * @param _label Unused in this implementation, the label for the Open Graph property.
+ * @param content The value for the Open Graph property.
+ * @returns The string value of the Open Graph content property.
+ */
+export function parseNumber(content: string): number | undefined {
+ // Copy the content to the return value.
+ return content.length > 0 ? parseFloat(content.replaceAll(/[^\d\.]/g, '') || '') : undefined;
+}
+
+/**
+ * This will read the first `` tag whose `property` attribute matches the value in the `propertyName` argument.
+ * Per the protocol, the first tag (from top to bottom) is given preference during conflicts.
+ * @param $ A `CheerioAPI` object.
+ * @param propertyName The property name to find the first content value of.
+ * @param onPropertyFound An optional function which provides an action for when data is located how to parse it. If omitted, performs a default copy.
+ * @returns A generic content (defaults to string type) unless there isn't any, then undefined.
+ */
+export function parseFirstOpenGraphMetaTagContentMatching($: CheerioAPI, propertyName: string): string | undefined;
+export function parseFirstOpenGraphMetaTagContentMatching(
+ $: CheerioAPI,
+ propertyName: string,
+ onPropertyFound?: OpenGraphParseHandler,
+): R | undefined {
+ if (typeof onPropertyFound !== 'function') {
+ onPropertyFound = parseString as OpenGraphParseHandler;
+ }
+ const cssSelector = `meta[property="${propertyName}"]`;
+ let queryResult = $(cssSelector);
+ if (queryResult.length > 0) {
+ const property = queryResult.attr('property');
+ const content = queryResult.attr('content');
+ do {
+ if (property === propertyName && content && content?.toString() && onPropertyFound) {
+ // return the first property found that matches the search label exactly
+ return onPropertyFound(content!);
+ }
+ queryResult = queryResult.next();
+ } while (queryResult.length > 0);
+ }
+ return undefined;
+}
+
+/**
+ * This will read all `` tag whose `property` attribute matches the value in the `propertyName` argument.
+ * Per the protocol, the first tag (from top to bottom) is given preference during conflicts.
+ * @param $ A `CheerioAPI` object.
+ * @param propertyName The property name to find the all content values of.
+ * @param onPropertyFound An optional function which provides an action for when data is located how to parse it. If omitted, performs a default copy.
+ * @returns An array of contents unless there are none, then undefined.
+ */
+export function parseAllOpenGraphMetaTagContentsMatching($: CheerioAPI, propertyName: string): string[] | undefined;
+export function parseAllOpenGraphMetaTagContentsMatching(
+ $: CheerioAPI,
+ propertyName: string,
+ onPropertyFound?: OpenGraphParseHandler,
+): R[] | undefined {
+ if (typeof onPropertyFound !== 'function') {
+ onPropertyFound = parseString as OpenGraphParseHandler;
+ }
+ const cssSelector = `meta[property="${propertyName}"]`;
+ let queryResult = $(cssSelector);
+ if (queryResult.length > 0) {
+ const returns: R[] = [];
+ do {
+ const property = queryResult.attr('property');
+ const content = queryResult.attr('content');
+ if (property === propertyName && content && content?.toString()) {
+ returns.push(onPropertyFound(content!));
+ }
+ queryResult = queryResult.next();
+ } while (queryResult.length > 0);
+ return returns;
+ }
+ return undefined;
+}
+
+/**
+ * One or more Open Graph structured object attributes along with parsers.
+ */
+export interface StructuredObjectAttributeParserConfiguration {
+ ogPropertyName: string;
+ mapPropertyName: string;
+ onStructuredPropertyFound?: OpenGraphParseHandler;
+}
+
+/**
+ *
+ * @param $ A `CheerioAPI` object.
+ * @param propertyName The property name to find the all content values of.
+ * @param defaultObjectPropertyName
+ * @param onTopLevelPropertyFound An optional function which provides an action for when data is located how to parse it. If omitted, performs a default copy.
+ * @param attributeHandlers
+ * @returns
+ */
+export function parseOpenGraphStructuredObjectMatching>(
+ $: CheerioAPI,
+ propertyName: string,
+ defaultObjectPropertyName?: string,
+ onTopLevelPropertyFound?: OpenGraphParseHandler,
+ attributeHandlers?: StructuredObjectAttributeParserConfiguration[],
+): R1 | R2 | R1[] | R2[] | [R1 | R2] | undefined {
+ if (typeof onTopLevelPropertyFound !== 'function') {
+ onTopLevelPropertyFound = parseString as OpenGraphParseHandler;
+ }
+ const cssSelector = `meta[property^="${propertyName}"]`;
+ let queryResult = $(cssSelector);
+ const result: R1 | R2 | R1[] | R2[] | [R1 | R2] = [];
+ if (queryResult.length > 0) {
+ // let's track the most recent root element
+ let mostRecentLabelRoot: Dictionary | any;
+
+ do {
+ // re-read the property, it does match everything starting with og:image
+ const property = queryResult.attr('property');
+ const content = queryResult.attr('content') as any;
+
+ // this is a new property root tag with a url value
+ if (property === propertyName) {
+ // if there was a previous image root tag, add it to the result
+ if (mostRecentLabelRoot!) {
+ result.push(onTopLevelPropertyFound(mostRecentLabelRoot as never));
+ }
+ mostRecentLabelRoot = content;
+ }
+ // this is an structured metadata tag
+ else {
+ // if there isn't anything store yet be sure to create an empty object
+ if (typeof mostRecentLabelRoot! === 'undefined') {
+ mostRecentLabelRoot = {} as Dictionary;
+ }
+ // convert any root tags with only a default value into structures with a default field
+ else if (typeof mostRecentLabelRoot !== 'object' && defaultObjectPropertyName) {
+ let newRecentLabelRoot = {} as Dictionary;
+ const subPropertyName = defaultObjectPropertyName.substring(propertyName.length + 1);
+ newRecentLabelRoot[subPropertyName] = mostRecentLabelRoot;
+ mostRecentLabelRoot = newRecentLabelRoot;
+ }
+ if (typeof attributeHandlers !== 'undefined') {
+ for (const attributeHandler of attributeHandlers) {
+ if (property === attributeHandler.ogPropertyName) {
+ const attributeHandlerOrDefaultParseString =
+ attributeHandler.onStructuredPropertyFound ?? parseString;
+ mostRecentLabelRoot[attributeHandler.mapPropertyName] =
+ attributeHandlerOrDefaultParseString(content as any) as any;
+ break; // break out of the inner loop, continuing with the outer loop
+ }
+ }
+ } else if (property?.indexOf(`${propertyName}:`) != -1) {
+ // if you didn't provide attribute handlers, then we'll just go ahead and assume you wanted them anyways and use defaultCopyOnRead
+ // to handle the parsing
+ mostRecentLabelRoot[property!.substring(propertyName.length + 1)] = parseString(
+ content as any,
+ ) as any;
+ }
+ }
+ // read the next result
+ queryResult = queryResult.next();
+
+ // loop until there are no more results
+ } while (queryResult.length > 0);
+
+ // if there was a previous image root tag, add it to the result
+ if (mostRecentLabelRoot!) {
+ result.push(mostRecentLabelRoot as never);
+ }
+ }
+ return result.length ? ((result.length > 1 ? result : result[0]) as R1 | R2 | R1[] | R2[] | [R1 | R2]) : undefined;
+}
+
+export function parseOpenGraphMetadata($: CheerioAPI): OpenGraphMetadata {
+ return {
+ url: parseFirstOpenGraphMetaTagContentMatching($, 'og:url'),
+ type: parseFirstOpenGraphMetaTagContentMatching($, 'og:type'),
+ title: parseFirstOpenGraphMetaTagContentMatching($, 'og:title'),
+ locale: parseFirstOpenGraphMetaTagContentMatching($, 'og:locale'),
+ localeAlternate: parseAllOpenGraphMetaTagContentsMatching($, 'og:locale:alternate'),
+ image: parseOpenGraphStructuredObjectMatching(
+ $,
+ 'og:image',
+ 'og:image:url',
+ parseString as OpenGraphParseHandler,
+ [
+ {
+ ogPropertyName: 'og:image:secure_url',
+ mapPropertyName: 'secureUrl',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:image:type',
+ mapPropertyName: 'type',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:image:width',
+ mapPropertyName: 'width',
+ onStructuredPropertyFound: parseNumber as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:image:height',
+ mapPropertyName: 'height',
+ onStructuredPropertyFound: parseNumber as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:image:alt',
+ mapPropertyName: 'alt',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ ],
+ ),
+ video: parseOpenGraphStructuredObjectMatching(
+ $,
+ 'og:video',
+ 'og:video:url',
+ parseString as OpenGraphParseHandler,
+ [
+ {
+ ogPropertyName: 'og:video:secure_url',
+ mapPropertyName: 'secureUrl',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:video:type',
+ mapPropertyName: 'type',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:video:width',
+ mapPropertyName: 'width',
+ onStructuredPropertyFound: parseNumber as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:video:height',
+ mapPropertyName: 'height',
+ onStructuredPropertyFound: parseNumber as OpenGraphParseHandler,
+ },
+ ],
+ ),
+ audio: parseOpenGraphStructuredObjectMatching(
+ $,
+ 'og:audio',
+ 'og:audio:url',
+ parseString as OpenGraphParseHandler,
+ [
+ {
+ ogPropertyName: 'og:audio:secure_url',
+ mapPropertyName: 'secureUrl',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:audio:type',
+ mapPropertyName: 'type',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:audio:title',
+ mapPropertyName: 'title',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:audio:artist',
+ mapPropertyName: 'artist',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:audio:album',
+ mapPropertyName: 'album',
+ onStructuredPropertyFound: parseString as OpenGraphParseHandler,
+ },
+ ],
+ ),
+ description: parseFirstOpenGraphMetaTagContentMatching($, 'og:description'),
+ siteName: parseFirstOpenGraphMetaTagContentMatching($, 'og:site_name'),
+ determiner: parseFirstOpenGraphMetaTagContentMatching($, 'og:determiner'),
+ geo: parseOpenGraphStructuredObjectMatching($, 'og:geo', '', undefined, [
+ {
+ ogPropertyName: 'og:geo:lat',
+ mapPropertyName: 'latitude',
+ onStructuredPropertyFound: parseNumber as OpenGraphParseHandler,
+ },
+ {
+ ogPropertyName: 'og:geo:long',
+ mapPropertyName: 'longitude',
+ onStructuredPropertyFound: parseNumber as OpenGraphParseHandler,
+ },
+ ]),
+ };
+}
/**
* To be used with the spread operator. Ensures that the item is defined, and is not empty.
@@ -21,140 +497,6 @@ const optionalSpread = (key: string, item: any) =>
item !== undefined && !!Object.values(item)?.length ? { [key]: item } : {};
const OPEN_GRAPH_PROPERTIES: OpenGraphProperty[] = [
- {
- name: 'og:title',
- outputName: 'title',
- children: [],
- },
- {
- name: 'og:type',
- outputName: 'type',
- children: [],
- },
- {
- name: 'og:image',
- outputName: 'image',
- children: [
- {
- name: 'og:image:url',
- outputName: 'url',
- children: [],
- },
- {
- name: 'og:image:secure_url',
- outputName: 'secureUrl',
- children: [],
- },
- {
- name: 'og:image:type',
- outputName: 'type',
- children: [],
- },
- {
- name: 'og:image:width',
- outputName: 'width',
- children: [],
- },
- {
- name: 'og:image:height',
- outputName: 'height',
- children: [],
- },
- {
- name: 'og:image:alt',
- outputName: 'alt',
- children: [],
- },
- ],
- },
- {
- name: 'og:url',
- outputName: 'url',
- children: [],
- },
- {
- name: 'og:audio',
- outputName: 'audio',
- children: [
- {
- name: 'og:audio:url',
- outputName: 'url',
- children: [],
- },
- {
- name: 'og:audio:secure_url',
- outputName: 'secureUrl',
- children: [],
- },
- {
- name: 'og:audio:type',
- outputName: 'type',
- children: [],
- },
- ],
- },
- {
- name: 'og:description',
- outputName: 'description',
- children: [],
- },
- {
- name: 'og:determiner',
- outputName: 'determiner',
- children: [],
- },
- {
- name: 'og:locale',
- outputName: 'locale',
- children: [
- {
- name: 'og:locale:alternate',
- outputName: 'alternate',
- children: [],
- },
- ],
- },
- {
- name: 'og:site_name',
- outputName: 'siteName',
- children: [],
- },
- {
- name: 'og:video',
- outputName: 'video',
- children: [
- {
- name: 'og:video:url',
- outputName: 'url',
- children: [],
- },
- {
- name: 'og:video:secure_url',
- outputName: 'secureUrl',
- children: [],
- },
- {
- name: 'og:video:type',
- outputName: 'type',
- children: [],
- },
- {
- name: 'og:video:width',
- outputName: 'width',
- children: [],
- },
- {
- name: 'og:video:height',
- outputName: 'height',
- children: [],
- },
- {
- name: 'og:video:alt',
- outputName: 'alt',
- children: [],
- },
- ],
- },
// The properties below aren't prefixed with "og".
// Part of the reason the properties have been hardcoded is because not all OpenGraph properties start with "og".
// Especially the newer ones that extend "og:type".
@@ -350,6 +692,8 @@ const OPEN_GRAPH_PROPERTIES: OpenGraphProperty[] = [
children: [],
},
],
+ //TODO: Include other deprecated properties such as geo:lat, geo:long, vcard:street-address, foaf:phone, isbn, upc, etc.
+ //As seen at https://ogp.me/ns/ogp.me.ttl.
},
];
@@ -400,13 +744,30 @@ export function parseOpenGraph($: CheerioAPI, additionalProperties?: OpenGraphPr
export function parseOpenGraph(item: CheerioAPI | string, additionalProperties?: OpenGraphProperty[]) {
const $ = typeof item === 'string' ? load(item) : item;
- return [...(additionalProperties || []), ...OPEN_GRAPH_PROPERTIES].reduce(
- (acc, curr) => {
- return {
- ...acc,
- ...optionalSpread(curr.outputName, parseOpenGraphProperty(curr, $)),
- };
- },
- {} as Dictionary,
+ let ogrDict: Dictionary = {};
+
+ // Parse metadata
+ const basicMetaData: OpenGraphMetadata = parseOpenGraphMetadata($);
+
+ // // Assemble open graph properties to search for
+ let props = [...(additionalProperties || []), ...OPEN_GRAPH_PROPERTIES];
+
+ // Determine cardinality of each element
+ // props = props.map((prop) => {
+ // });
+
+ ogrDict = Object.assign(
+ ogrDict,
+ props.reduce(
+ (acc, curr) => {
+ return {
+ ...acc,
+ ...optionalSpread(curr.outputName, parseOpenGraphProperty(curr, $)),
+ };
+ },
+ {} as Dictionary,
+ ),
+ basicMetaData,
);
+ return ogrDict;
}
diff --git a/test/utils/open_graph_parser.test.ts b/test/utils/open_graph_parser.test.ts
index ac68674fd35b..e70d025639ba 100644
--- a/test/utils/open_graph_parser.test.ts
+++ b/test/utils/open_graph_parser.test.ts
@@ -1,4 +1,4 @@
-import { parseOpenGraph } from '@crawlee/utils';
+import { parseOpenGraph, OpenGraphMetadata } from '@crawlee/utils';
import { load } from 'cheerio';
describe('parseOpenGraph', () => {
@@ -21,6 +21,45 @@ describe('parseOpenGraph', () => {
const case6 = `
`;
+ const case7 = `
+
+
+
+
+
+ Document
+
+
+
+
+
+
+
+
+
+
+
+
+
+ `;
+
+ const case8 = `
+
+
+
+
+
+
+
+
+ `;
+
+ const case9 = `
+
+
+
+ `;
+
it('Should scrape properties', () => {
expect(parseOpenGraph(case1)).toEqual({
title: 'Under Pressure',
@@ -38,13 +77,13 @@ describe('parseOpenGraph', () => {
expect(parsed.videoInfo.actor.actorValue).toContain('bar');
expect(parsed.videoInfo.actor.actorValue).toContain('baz');
- const parsed2 = parseOpenGraph(case3) as {
- locale: { localeValue: string; alternate: string[] };
- };
+ const parsed2 = parseOpenGraph(case3) as OpenGraphMetadata;
expect(parsed2).toHaveProperty('locale');
- expect(parsed2.locale.alternate).toContain('foo');
- expect(parsed2.locale.alternate).toContain('bar');
+ expect(parsed2.locale).toContain('test');
+ expect(parsed2).toHaveProperty('localeAlternate');
+ expect(parsed2.localeAlternate).toContain('foo');
+ expect(parsed2.localeAlternate).toContain('bar');
});
it('Should parse properties regardless of how deeply they are nested', () => {
@@ -77,4 +116,69 @@ describe('parseOpenGraph', () => {
type: 'website',
});
});
+
+ it('Should parse arrays of images with props', () => {
+ const parsed = parseOpenGraph(case7);
+
+ expect(parsed).toEqual({
+ title: 'The Rock',
+ type: 'video.movie',
+ url: 'https://www.imdb.com/title/tt0117500/',
+ image: [
+ // Either this:
+ 'https://ia.media-imdb.com/images/rock.jpg',
+ // Or this:
+ // {
+ // url: 'https://ia.media-imdb.com/images/rock.jpg',
+ // },
+ {
+ url: 'https://example.com/rock2.jpg',
+ width: 300,
+ height: 300,
+ },
+ {
+ url: 'https://example.com/rock3.jpg',
+ height: 1000,
+ },
+ ],
+ });
+ });
+
+ it('Should parse arrays of videos with props', () => {
+ const parsed = parseOpenGraph(case8);
+
+ expect(parsed).toEqual({
+ title: 'The Rock',
+ type: 'video.movie',
+ url: 'https://www.imdb.com/title/tt0117500/',
+ video: [
+ // Either this:
+ 'https://www.youtube.com/watch?v=jGVJx5mOtL8',
+ // Or this:
+ // {
+ // url: 'https://www.youtube.com/watch?v=jGVJx5mOtL8',
+ // },
+ {
+ url: 'https://www.youtube.com/watch?v=a3qcNyjj9ZQ',
+ width: 1920,
+ height: 1080,
+ },
+ {
+ url: 'https://www.youtube.com/watch?v=313n0wga2xo',
+ height: 1080,
+ },
+ ],
+ });
+ });
+
+ it('Should parse deprecated geo:lat and geo:long', () => {
+ const parsed = parseOpenGraph(case9);
+
+ expect(parsed).toEqual({
+ geo: {
+ latitude: 50.081534,
+ longitude: 14.426464,
+ },
+ });
+ });
});