-
Notifications
You must be signed in to change notification settings - Fork 4
431 website scraper #433
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
431 website scraper #433
Changes from all commits
cdf3d57
1bb7f2a
7cac820
ebba36b
41442ac
c53f23d
18cb677
40f72b4
23eb600
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,64 @@ | ||
| import { Types } from 'mongoose'; | ||
| import parentLogger from 'src/config/logger'; | ||
|
|
||
| import { CalendarSpec, Client, ScheduleHandle, ScheduleOverlapPolicy } from '@temporalio/client'; | ||
|
|
||
| import config from '../../config'; | ||
| import { TemporalCoreService } from './core.service'; | ||
|
|
||
| const logger = parentLogger.child({ module: 'WebsiteTemporalService' }); | ||
|
|
||
| class TemporalWebsiteService extends TemporalCoreService { | ||
| public async createSchedule(platformId: Types.ObjectId): Promise<ScheduleHandle> { | ||
| const initiationTime = new Date(); | ||
| const dayNumber = initiationTime.getUTCDay(); | ||
| const hour = initiationTime.getUTCHours(); | ||
| const minute = initiationTime.getUTCMinutes(); | ||
| const DAY_NAMES = ['SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY'] as const; | ||
| const dayOfWeek = DAY_NAMES[dayNumber]; | ||
|
|
||
| const calendarSpec: CalendarSpec = { | ||
| dayOfWeek, | ||
| hour, | ||
| minute, | ||
| comment: `Weekly schedule for ${dayOfWeek} at ${hour}:${minute} UTC`, | ||
| }; | ||
|
|
||
| try { | ||
| const client: Client = await this.getClient(); | ||
|
|
||
| return client.schedule.create({ | ||
| scheduleId: `website/${platformId}`, | ||
| spec: { | ||
| calendars: [calendarSpec], | ||
| }, | ||
| action: { | ||
| type: 'startWorkflow', | ||
| workflowType: 'WebsiteIngestionSchedulerWorkflow', | ||
| args: [{ platformId }], | ||
| taskQueue: config.temporal.heavyQueue, | ||
| }, | ||
| policies: { | ||
| catchupWindow: '1 day', | ||
| overlap: ScheduleOverlapPolicy.SKIP, | ||
| }, | ||
| }); | ||
| } catch (error) { | ||
| throw new Error(`Failed to create or update website ingestion schedule: ${(error as Error).message}`); | ||
| } | ||
| } | ||
|
|
||
| public async pauseSchedule(scheduleId: string): Promise<void> { | ||
| const client: Client = await this.getClient(); | ||
| const handle = client.schedule.getHandle(scheduleId); | ||
| await handle.pause(); | ||
| } | ||
|
|
||
| public async deleteSchedule(scheduleId: string): Promise<void> { | ||
| const client: Client = await this.getClient(); | ||
| const handle = client.schedule.getHandle(scheduleId); | ||
| await handle.delete(); | ||
| } | ||
| } | ||
|
|
||
| export default new TemporalWebsiteService(); |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,33 @@ | ||
| import { Types } from 'mongoose'; | ||
|
|
||
| import parentLogger from '../../config/logger'; | ||
| import { ApiError } from '../../utils'; | ||
| import temporalWebsite from '../temporal/website.service'; | ||
|
|
||
| const logger = parentLogger.child({ module: 'WebsiteCoreService' }); | ||
|
|
||
| async function createWebsiteSchedule(platformId: Types.ObjectId): Promise<string> { | ||
| try { | ||
| const schedule = await temporalWebsite.createSchedule(platformId); | ||
| logger.info(`Started schedule '${schedule.scheduleId}'`); | ||
| await schedule.trigger(); | ||
| return schedule.scheduleId; | ||
| } catch (error) { | ||
| logger.error(error, 'Failed to trigger website schedule.'); | ||
| throw new ApiError(590, 'Failed to create website schedule.'); | ||
| } | ||
| } | ||
|
|
||
| async function deleteWebsiteSchedule(scheduleId: string): Promise<void> { | ||
| try { | ||
| await temporalWebsite.deleteSchedule(scheduleId); | ||
| } catch (error) { | ||
| logger.error(error, 'Failed to delete website schedule.'); | ||
| throw new ApiError(590, 'Failed to delete website schedule.'); | ||
| } | ||
| } | ||
|
|
||
| export default { | ||
| createWebsiteSchedule, | ||
| deleteWebsiteSchedule, | ||
| }; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
| import coreService from './core.service'; | ||
|
|
||
| export default { | ||
| coreService, | ||
| }; |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,6 +28,12 @@ const discordUpdateMetadata = () => { | |
| analyzerStartedAt: Joi.date(), | ||
| }); | ||
| }; | ||
|
|
||
| const websiteUpdateMetadata = () => { | ||
| return Joi.object().keys({ | ||
| resources: Joi.array().items(Joi.string().uri({ scheme: ['http', 'https'] })), | ||
| }); | ||
| }; | ||
|
Comment on lines +32 to +36
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Enhance website resource validation with additional safeguards.
While URI validation is good, consider adding the safeguards shown in the example below: a cap on the number of resources, rate-limit settings, and an allowed-domains list.
Example enhancement:
const websiteMetadata = () => {
return Joi.object().keys({
resources: Joi.array()
.items(Joi.string().uri({ scheme: ['http', 'https'] }))
+ .max(100) // Prevent excessive resource lists
.required(),
+ rateLimit: Joi.object().keys({
+ requestsPerMinute: Joi.number().min(1).max(60).required(),
+ concurrency: Joi.number().min(1).max(10).required()
+ }).required(),
+ allowedDomains: Joi.array().items(Joi.string().domain()).required()
});
};
Also applies to: 101-107 |
||
| const twitterMetadata = () => { | ||
| return Joi.object().keys({ | ||
| id: Joi.string().required(), | ||
|
|
@@ -92,6 +98,14 @@ const discourseMetadata = () => { | |
| }); | ||
| }; | ||
|
|
||
| const websiteMetadata = () => { | ||
| return Joi.object().keys({ | ||
| resources: Joi.array() | ||
| .items(Joi.string().uri({ scheme: ['http', 'https'] })) | ||
| .required(), | ||
| }); | ||
| }; | ||
|
|
||
| const createPlatform = { | ||
| body: Joi.object().keys({ | ||
| name: Joi.string() | ||
|
|
@@ -130,7 +144,11 @@ const createPlatform = { | |
| }, | ||
| { | ||
| is: PlatformNames.Telegram, | ||
| then: telegramMetadata, | ||
| then: telegramMetadata(), | ||
| }, | ||
| { | ||
| is: PlatformNames.Website, | ||
| then: websiteMetadata(), | ||
| }, | ||
| ], | ||
| }).required(), | ||
|
|
@@ -201,6 +219,16 @@ const dynamicUpdatePlatform = (req: Request) => { | |
| }), | ||
| }; | ||
| } | ||
| case PlatformNames.Website: { | ||
| return { | ||
| params: Joi.object().keys({ | ||
| platformId: Joi.required().custom(objectId), | ||
| }), | ||
| body: Joi.object().required().keys({ | ||
| metadata: websiteUpdateMetadata(), | ||
| }), | ||
| }; | ||
| } | ||
| default: | ||
| req.allowInput = false; | ||
| return {}; | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🛠️ Refactor suggestion
Consider enhancing website metadata validation.
The empty object schema for website metadata might be too permissive. Consider adding validation for essential website-related fields such as: