Skip to content

Commit 1adf54c

Browse files
committed
first successful scrape of the new api
1 parent 6bc33c2 commit 1adf54c

File tree

5 files changed

+189
-87
lines changed

5 files changed

+189
-87
lines changed

scrapers/nus-v2/src/services/io/elastic.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -148,6 +148,10 @@ export default class ElasticPersist implements Persist {
148148
}
149149
}
150150

151+
if (bulkBody.length === 0) {
152+
return;
153+
}
154+
151155
const client = await this.client;
152156
const res = await client.bulk({
153157
index: INDEX_NAME,

scrapers/nus-v2/src/services/nus-api.ts

Lines changed: 99 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import type {
2020
import type { ModuleCode } from '../types/modules';
2121

2222
import { AuthError, NotFoundError, UnknownApiError } from '../utils/errors';
23+
import { fromTermCode } from '../utils/api';
2324
import config from '../config';
2425

2526
// Interface extracted for easier mocking
@@ -142,6 +143,40 @@ function mapErrorCode(code: string, msg: string) {
142143
return error;
143144
}
144145

146+
/**
147+
* Maps the 4-digit term code to the parameters expected by the CourseNUSMods API.
*
* @param term - Term code; decoded with fromTermCode into an academic year and a semester.
* @returns An object with `applicableInYear` (the academic year with a shortened end year)
*   and `applicableInSem` (the semester label).
148+
*/
149+
function mapTermToApiParams(term: string) {
150+
const [acadYear, semester] = fromTermCode(term);
151+
152+
// Drop the first two characters of the end year: 2024/2025 -> 2024/25
// NOTE(review): assumes acadYear contains a '/'; otherwise yearParts[1] is undefined
// and .slice() throws — confirm against fromTermCode.
153+
const yearParts = acadYear.split('/');
154+
const shortYear = `${yearParts[0]}/${yearParts[1].slice(2)}`;
155+
156+
let applicableInSem = '';
// Translate the semester value into the label sent as applicableInSem.
157+
switch (semester) {
158+
case 1:
159+
applicableInSem = 'Semester 1';
160+
break;
161+
case 2:
162+
applicableInSem = 'Semester 2';
163+
break;
164+
case 3:
165+
applicableInSem = 'Special Semester (Part 1)';
166+
break;
167+
case 4:
168+
applicableInSem = 'Special Semester (Part 2)';
169+
break;
// Any other semester value falls back to a generic "Semester <value>" label.
170+
default:
171+
applicableInSem = `Semester ${semester}`;
172+
}
173+
174+
return {
175+
applicableInYear: shortYear,
176+
applicableInSem,
177+
};
178+
}
179+
145180
/* eslint-disable camelcase */
146181

147182
/**
@@ -263,16 +298,59 @@ class NusApi implements INusApi {
263298
* Calls the modules endpoint
264299
*/
265300
callModulesEndpoint = async (term: string, params: ApiParams): Promise<ModuleInfo[]> => {
301+
const termParams = mapTermToApiParams(term);
302+
const maxItems = 1000;
303+
const baseParams = {
304+
...termParams,
305+
...params,
306+
latestVersionOnly: 'True',
307+
publishedOnly: 'True',
308+
maxItems: String(maxItems),
309+
};
310+
266311
try {
267-
const { data: modules } = await this.callApi<ModuleInfo[]>(
312+
// 1. Fetch the first page to get the total itemCount
313+
const firstResponse = await this.callApi<{ data: ModuleInfo[]; itemCount: number }>(
268314
'CourseNUSMods',
269315
{
270-
term,
271-
...params,
316+
...baseParams,
317+
offset: '0',
272318
},
273319
courseHeaders,
274320
);
275-
return modules;
321+
322+
const allModules = [...firstResponse.data.data];
323+
const { itemCount } = firstResponse.data;
324+
325+
// 2. If there are more items, fetch the remaining pages in parallel.
326+
// Since this.callApi uses a queue, concurrency will still be limited.
327+
const remainingPages = [];
328+
for (let offset = allModules.length; offset < itemCount; offset += maxItems) {
329+
remainingPages.push(
330+
this.callApi<{ data: ModuleInfo[]; itemCount: number }>(
331+
'CourseNUSMods',
332+
{
333+
...baseParams,
334+
offset: String(offset),
335+
},
336+
courseHeaders,
337+
),
338+
);
339+
}
340+
341+
if (remainingPages.length > 0) {
342+
const responses = await Promise.all(remainingPages);
343+
responses.forEach((response) => {
344+
allModules.push(...response.data.data);
345+
});
346+
}
347+
348+
console.log(
349+
`[API] CourseNUSMods fetched ${allModules.length}/${itemCount} results`,
350+
baseParams,
351+
);
352+
353+
return allModules;
276354
} catch (e) {
277355
// The modules endpoint will return NotFound even for valid inputs
278356
// that just happen to have no records, so we ignore this error
@@ -313,26 +391,35 @@ class NusApi implements INusApi {
313391
}
314392

315393
// catalognbr = Catalog number
316-
const [subject, catalognbr] = parts;
317-
const { data: modules } = await this.callApi<ModuleInfo[]>(
394+
const [, subject, catalognbr] = parts;
395+
const termParams = mapTermToApiParams(term);
396+
const { data: response } = await this.callApi<{ data: ModuleInfo[]; itemCount: number }>(
318397
'CourseNUSMods',
319398
{
320-
term,
321-
subject,
322-
catalognbr,
399+
...termParams,
400+
subjectArea: subject,
401+
catalogNbr: catalognbr,
402+
latestVersionOnly: 'True',
403+
publishedOnly: 'True',
323404
},
324405
courseHeaders,
325406
);
407+
const modules = response.data;
326408

409+
console.log(`[API] CourseNUSMods returned ${response.itemCount} result(s) for ${moduleCode}`);
327410
if (modules.length === 0) throw new NotFoundError(`Module ${moduleCode} cannot be found`);
328411
return modules[0];
329412
};
330413

331414
getFacultyModules = async (term: string, facultyCode: string) =>
332-
this.callModulesEndpoint(term, { acadgroup: facultyCode });
415+
this.callModulesEndpoint(term, { acadGroupCode: facultyCode.slice(0, 3) });
333416

334-
getDepartmentModules = async (term: string, departmentCode: string): Promise<ModuleInfo[]> =>
335-
this.callModulesEndpoint(term, { acadorg: departmentCode });
417+
getDepartmentModules = async (term: string, departmentCode: string): Promise<ModuleInfo[]> => {
418+
const modules = await this.callModulesEndpoint(term, {
419+
acadGroupCode: departmentCode.slice(0, 3),
420+
});
421+
return modules.filter((module) => module.OrganisationCode === departmentCode);
422+
};
336423

337424
getModuleTimetable = async (term: string, module: ModuleCode): Promise<TimetableLesson[]> =>
338425
this.callV1Api(

scrapers/nus-v2/src/tasks/GetSemesterData.ts

Lines changed: 40 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -144,7 +144,9 @@ export function cleanModuleInfo(module: SemesterModule) {
144144
* Parse the workload string into a mapping of individual components to their hours.
145145
* If the string is unparsable, it is returned without any modification.
146146
*/
147-
export function parseWorkload(workloadString: string): Workload {
147+
export function parseWorkload(workloadString: string | null | undefined): Workload {
148+
if (!workloadString) return '';
149+
148150
const cleanedWorkloadString = workloadString
149151
.replace(/\(.*?\)/g, '') // Remove stuff in parenthesis
150152
.replace(/NA/gi, '0') // Replace 'NA' with 0
@@ -178,52 +180,54 @@ const mapModuleInfo = (
178180
departmentMap: DepartmentCodeMap,
179181
facultyMap: FacultyCodeMap,
180182
logger: Logger,
183+
acadYear: string,
181184
): SemesterModule => {
182185
const {
183-
Term,
184-
AcademicOrganisation,
186+
OrganisationCode,
185187
AcademicGroup,
186-
CourseTitle,
187-
AdditionalInformation,
188-
WorkLoadHours,
189-
GradingBasisDesc,
190-
Preclusion,
188+
Title,
189+
WorkloadHoursNUSMods,
190+
PreclusionSummary,
191191
PreclusionRule,
192-
PreRequisite,
193-
PreRequisiteRule,
194192
PreRequisiteAdvisory,
195-
CoRequisite,
196-
CoRequisiteRule,
197-
ModularCredit,
198-
Description,
199-
Subject,
193+
PrerequisiteRule,
194+
PrerequisiteSummary,
195+
CorequisiteRule,
196+
CorequisiteSummary,
197+
UnitsMin,
198+
CourseDesc,
199+
SubjectArea,
200200
CatalogNumber,
201-
ModuleAttributes = [],
201+
CourseAttributes = [],
202202
} = moduleInfo;
203203

204-
const [AcadYear] = fromTermCode(Term);
205-
206204
// We map department from our department list because
207205
// AcademicOrganisation.Description is empty for some reason
208206
return {
209-
acadYear: AcadYear,
210-
preclusion: Preclusion,
207+
acadYear,
208+
preclusion: PreclusionSummary,
211209
preclusionRule: PreclusionRule,
212-
description: Description,
213-
title: CourseTitle,
214-
additionalInformation: AdditionalInformation,
215-
department: departmentMap[AcademicOrganisation.Code],
216-
faculty: facultyMap[AcademicGroup.Code],
217-
workload: parseWorkload(WorkLoadHours),
218-
gradingBasisDescription: GradingBasisDesc,
219-
prerequisite: PreRequisite,
220-
prerequisiteRule: PreRequisiteRule,
210+
description: CourseDesc,
211+
title: Title,
212+
additionalInformation: '', // Missing in new API?
213+
department: departmentMap[OrganisationCode],
214+
faculty: facultyMap[AcademicGroup],
215+
workload: parseWorkload(WorkloadHoursNUSMods),
216+
gradingBasisDescription: '', // Missing in new API?
217+
prerequisite: PrerequisiteSummary,
218+
prerequisiteRule: PrerequisiteRule,
221219
prerequisiteAdvisory: PreRequisiteAdvisory,
222-
corequisite: CoRequisite,
223-
corequisiteRule: CoRequisiteRule,
224-
moduleCredit: ModularCredit,
225-
moduleCode: Subject + CatalogNumber,
226-
attributes: mapAttributes(ModuleAttributes, logger),
220+
corequisite: CorequisiteSummary,
221+
corequisiteRule: CorequisiteRule,
222+
moduleCredit: String(UnitsMin),
223+
moduleCode: SubjectArea + CatalogNumber,
224+
attributes: mapAttributes(
225+
CourseAttributes.map((attr) => ({
226+
CourseAttribute: attr.Code,
227+
CourseAttributeValue: attr.Value,
228+
})),
229+
logger,
230+
),
227231
};
228232
};
229233

@@ -284,10 +288,7 @@ export default class GetSemesterData extends BaseTask implements Task<Input, Out
284288
const facultyMap = getFacultyCodeMap(input.faculties);
285289

286290
// Key modules by their module code for easier mapping
287-
const modulesMap = keyBy(
288-
modules,
289-
(moduleInfo) => moduleInfo.Subject + moduleInfo.CatalogNumber,
290-
);
291+
const modulesMap = keyBy(modules, (moduleInfo) => moduleInfo.SubjectArea + moduleInfo.CatalogNumber);
291292

292293
// Combine all three source of data into one set of semester module info.
293294
//
@@ -300,7 +301,7 @@ export default class GetSemesterData extends BaseTask implements Task<Input, Out
300301

301302
// Map module info to the shape expected by our frontend and clean up
302303
// the data by removing nil fields and fixing data issues
303-
const rawModule = mapModuleInfo(moduleInfo, departmentMap, facultyMap, logger);
304+
const rawModule = mapModuleInfo(moduleInfo, departmentMap, facultyMap, logger, academicYear);
304305
const module = cleanModuleInfo(rawModule);
305306

306307
const timetable = timetables[moduleCode];

scrapers/nus-v2/src/tasks/GetSemesterModules.ts

Lines changed: 22 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -54,38 +54,49 @@ export default class GetSemesterModules extends BaseTask implements Task<Input,
5454

5555
const term = getTermCode(this.semester, this.academicYear);
5656

57-
// We make a new request for each department because the API will timeout if
57+
let downloadedCount = 0;
58+
const totalFaculties = input.faculties.length;
59+
60+
// We make a new request for each faculty because the API will time out if
5861
// we try to request for all of them in one shot
59-
const requests = input.departments.map(async (department) => {
62+
const requests = input.faculties.map(async (faculty, index) => {
6063
try {
61-
const getModules = () =>
62-
this.api.getDepartmentModules(term, department.AcademicOrganisation);
64+
const getModules = () => this.api.getFacultyModules(term, faculty.AcademicGroup);
6365
const modules = await retry(getModules, 3, (error) => error instanceof UnknownApiError);
6466

6567
// Only return modules which are visible in the system
6668
const [printed, hidden] = partition(
6769
modules,
68-
(module: ModuleInfo) => module.PrintCatalog === 'Y',
70+
(module: ModuleInfo) => module.PrintCatalog !== 'N',
71+
);
72+
73+
downloadedCount += printed.length;
74+
this.logger.info(
75+
'[%d/%d] Downloaded %i modules from %s (Total: %d)',
76+
index + 1,
77+
totalFaculties,
78+
printed.length,
79+
faculty.Description,
80+
downloadedCount,
6981
);
7082

71-
this.logger.debug('Downloaded %i modules from %s', printed.length, department.Description);
7283
if (hidden.length > 0) {
7384
this.logger.debug('Filtered out %i non-print modules', hidden.length);
7485
}
7586

7687
printed.forEach(
7788
(module) =>
78-
!!containsNbsps(module.Description) &&
89+
!!containsNbsps(module.CourseDesc) &&
7990
this.logger.error(
80-
{ moduleCode: `${module.Subject}${module.CatalogNumber}` },
81-
`${module.Subject}${module.CatalogNumber}: Module description contains non-breaking spaces`,
91+
{ moduleCode: `${module.SubjectArea}${module.CatalogNumber}` },
92+
`${module.SubjectArea}${module.CatalogNumber}: Module description contains non-breaking spaces`,
8293
),
8394
);
8495

8596
return printed;
8697
} catch (e) {
87-
this.logger.error(e, `Cannot get modules from ${department.Description}`);
88-
throw new TaskError(`Cannot get modules from ${department.Description}`, this, e);
98+
this.logger.error(e, `Cannot get modules from ${faculty.Description}`);
99+
throw new TaskError(`Cannot get modules from ${faculty.Description}`, this, e);
89100
}
90101
});
91102

0 commit comments

Comments
 (0)