import { WebScraper } from '../utils/WebScraper';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';

// Replace the real logger so tests can assert on what was logged.
jest.mock('../utils/logger', () => ({
  Logger: {
    error: jest.fn(),
    warn: jest.fn(),
    info: jest.fn(),
    debug: jest.fn()
  }
}));

// Replace the global fetch so no test touches the network.
global.fetch = jest.fn();

/**
 * Unit tests for WebScraper.
 *
 * NOTE(review): the HTML fixtures below were reconstructed from each test's
 * expected values — confirm them against the intended markup.
 */
describe('WebScraper', () => {
  let webScraper: WebScraper;
  const mockFetch = fetch as jest.MockedFunction<typeof fetch>;

  /** Queues a successful response whose body is the given HTML. */
  const respondWithHtml = (html: string): void => {
    mockFetch.mockResolvedValue({
      ok: true,
      text: () => Promise.resolve(html)
    } as Response);
  };

  beforeEach(() => {
    webScraper = new WebScraper();
    jest.clearAllMocks();
    // clearAllMocks() keeps queued implementations; reset so a response
    // configured by one test can never leak into the next.
    mockFetch.mockReset();
  });

  describe('scrapeUrl', () => {
    test('should successfully scrape a URL with complete metadata', async () => {
      respondWithHtml(`
        <!DOCTYPE html>
        <html>
          <head>
            <title>Test News Article</title>
            <meta name="description" content="This is a test news article description">
            <meta property="og:title" content="Test News Article">
            <meta property="og:description" content="This is a test news article description">
            <meta property="article:published_time" content="2024-01-15T10:30:00Z">
          </head>
          <body>
            <article>
              <h1>Test News Article</h1>
              <p>Article content here...</p>
            </article>
          </body>
        </html>
      `);

      const result = await webScraper.scrapeUrl('https://example.com/news');

      expect(result).toEqual({
        title: 'Test News Article',
        description: 'This is a test news article description',
        url: 'https://example.com/news',
        publishedAt: new Date('2024-01-15T10:30:00Z')
      });

      expect(mockFetch).toHaveBeenCalledWith('https://example.com/news', {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
      });
    });

    test('should handle HTTP errors gracefully', async () => {
      mockFetch.mockResolvedValue({
        ok: false,
        status: 404,
        statusText: 'Not Found'
      } as Response);

      const result = await webScraper.scrapeUrl('https://example.com/not-found');

      expect(result).toBeNull();
      expect(Logger.error).toHaveBeenCalledWith(
        'Failed to fetch https://example.com/not-found: 404 Not Found'
      );
    });

    test('should handle network errors gracefully', async () => {
      mockFetch.mockRejectedValue(new Error('Network error'));

      const result = await webScraper.scrapeUrl('https://example.com/error');

      expect(result).toBeNull();
      expect(Logger.error).toHaveBeenCalledWith(
        'Error scraping https://example.com/error:',
        expect.any(Error)
      );
    });

    test('should return null when no title is found', async () => {
      respondWithHtml(`
        <!DOCTYPE html>
        <html>
          <head>
            <meta name="description" content="Description without a title">
          </head>
          <body>
            <p>Content without title</p>
          </body>
        </html>
      `);

      const result = await webScraper.scrapeUrl('https://example.com/no-title');

      expect(result).toBeNull();
      expect(Logger.warn).toHaveBeenCalledWith('No title found for https://example.com/no-title');
    });

    test('should return null when no description is found', async () => {
      respondWithHtml(`
        <!DOCTYPE html>
        <html>
          <head>
            <title>Title Only</title>
          </head>
          <body>
            <p>Content without description meta</p>
          </body>
        </html>
      `);

      const result = await webScraper.scrapeUrl('https://example.com/no-description');

      expect(result).toBeNull();
      expect(Logger.warn).toHaveBeenCalledWith('No description found for https://example.com/no-description');
    });

    test('should use current date when no published date is found', async () => {
      respondWithHtml(`
        <!DOCTYPE html>
        <html>
          <head>
            <title>Test Article</title>
            <meta name="description" content="Test description">
          </head>
        </html>
      `);

      const beforeScrape = new Date();
      const result = await webScraper.scrapeUrl('https://example.com/no-date');
      const afterScrape = new Date();

      expect(result).not.toBeNull();
      expect(result!.publishedAt.getTime()).toBeGreaterThanOrEqual(beforeScrape.getTime());
      expect(result!.publishedAt.getTime()).toBeLessThanOrEqual(afterScrape.getTime());
    });

    // Lives with the other scrapeUrl tests and goes through the shared
    // mockFetch handle; rebinding global.fetch would detach that handle.
    test('should handle HTML with special characters and entities', async () => {
      respondWithHtml(`
        <!DOCTYPE html>
        <html>
          <head>
            <title>News &amp; Updates - El País</title>
            <meta name="description" content="Breaking news &quot;today&quot; &amp; analysis">
          </head>
        </html>
      `);

      const result = await webScraper.scrapeUrl('https://example.com/news');

      expect(result).toEqual({
        title: 'News & Updates - El País',
        description: 'Breaking news "today" & analysis',
        url: 'https://example.com/news',
        publishedAt: expect.any(Date)
      });
    });
  });

  describe('convertToFeedData', () => {
    test('should convert scraped data to feed format', () => {
      const scrapedData = {
        title: 'Test News',
        description: 'Test description',
        url: 'https://example.com/news',
        publishedAt: new Date('2024-01-15T10:00:00Z')
      };

      const feedData = webScraper.convertToFeedData(scrapedData, NewsSource.EL_PAIS);

      expect(feedData).toEqual({
        title: 'Test News',
        description: 'Test description',
        url: 'https://example.com/news',
        source: NewsSource.EL_PAIS,
        publishedAt: new Date('2024-01-15T10:00:00Z'),
        isManual: false
      });
    });
  });
});
/** Normalized metadata extracted from a single scraped page. */
interface ScrapedData {
  title: string;
  description: string;
  url: string;
  publishedAt: Date;
}

/** Lower-cased `property`/`name` key of a `<meta>` tag mapped to its decoded, trimmed `content`. */
type MetaIndex = ReadonlyMap<string, string>;

// A start tag's attribute section: quoted values are consumed as a unit so a
// `>` or the other quote character inside a value cannot end the tag early.
const META_TAG_PATTERN = /<meta(?=[\s\/>])((?:[^>"']|"[^"]*"|'[^']*')*)>/gi;
const TIME_TAG_PATTERN = /<time(?=[\s\/>])((?:[^>"']|"[^"]*"|'[^']*')*)>/gi;
const TITLE_TAG_PATTERN = /<title[^>]*>([^<]+)<\/title>/i;
// name="value" | name='value' | name=value
const ATTRIBUTE_PATTERN = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)'|([^\s"'=<>`]+))/g;
const ENTITY_PATTERN = /&(#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);/gi;

// Deliberately small table: markup-significant characters, common typography,
// and the accented letters used in Spanish. Anything else is left untouched.
const NAMED_ENTITIES: ReadonlyMap<string, string> = new Map<string, string>([
  ['amp', '&'], ['lt', '<'], ['gt', '>'], ['quot', '"'], ['apos', "'"],
  ['nbsp', '\u00a0'], ['iexcl', '\u00a1'], ['iquest', '\u00bf'],
  ['laquo', '\u00ab'], ['raquo', '\u00bb'], ['copy', '\u00a9'], ['reg', '\u00ae'],
  ['ndash', '\u2013'], ['mdash', '\u2014'], ['hellip', '\u2026'], ['euro', '\u20ac'],
  ['lsquo', '\u2018'], ['rsquo', '\u2019'], ['ldquo', '\u201c'], ['rdquo', '\u201d'],
  ['aacute', '\u00e1'], ['eacute', '\u00e9'], ['iacute', '\u00ed'], ['oacute', '\u00f3'],
  ['uacute', '\u00fa'], ['ntilde', '\u00f1'], ['uuml', '\u00fc'],
  ['Aacute', '\u00c1'], ['Eacute', '\u00c9'], ['Iacute', '\u00cd'], ['Oacute', '\u00d3'],
  ['Uacute', '\u00da'], ['Ntilde', '\u00d1'], ['Uuml', '\u00dc']
]);

/**
 * Decodes numeric (`&#39;`, `&#x27;`) and the supported named character
 * references in a single pass, so `&amp;lt;` becomes `&lt;` rather than `<`.
 * Unknown names and invalid code points are returned unchanged.
 */
function decodeHtmlEntities(text: string): string {
  return text.replace(ENTITY_PATTERN, (entity: string, body: string): string => {
    if (body.charAt(0) !== '#') {
      return NAMED_ENTITIES.get(body) ?? entity;
    }
    const isHex = body.charAt(1).toLowerCase() === 'x';
    const codePoint = isHex ? parseInt(body.slice(2), 16) : parseInt(body.slice(1), 10);
    // String.fromCodePoint throws outside the Unicode range; surrogates are not characters.
    const isScalarValue =
      codePoint > 0 && codePoint <= 0x10ffff && (codePoint < 0xd800 || codePoint > 0xdfff);
    return isScalarValue ? String.fromCodePoint(codePoint) : entity;
  });
}

/** Parses a start tag's attribute section into a map keyed by lower-cased attribute name. */
function parseAttributes(source: string): Map<string, string> {
  const attributes = new Map<string, string>();
  for (const match of source.matchAll(ATTRIBUTE_PATTERN)) {
    const name = (match[1] ?? '').toLowerCase();
    const value = match[2] ?? match[3] ?? match[4] ?? '';
    // The first occurrence of a duplicated attribute wins.
    if (name !== '' && !attributes.has(name)) {
      attributes.set(name, value);
    }
  }
  return attributes;
}

/**
 * Indexes every `<meta>` tag by its `property` and `name` attributes,
 * regardless of attribute order. Tags with blank content are skipped and the
 * first tag seen for a key wins.
 */
function indexMetaTags(html: string): Map<string, string> {
  const index = new Map<string, string>();
  for (const tag of html.matchAll(META_TAG_PATTERN)) {
    const attributes = parseAttributes(tag[1] ?? '');
    const content = decodeHtmlEntities(attributes.get('content') ?? '').trim();
    if (content === '') {
      continue;
    }
    for (const keyAttribute of ['property', 'name']) {
      const key = attributes.get(keyAttribute)?.trim().toLowerCase();
      if (key && !index.has(key)) {
        index.set(key, content);
      }
    }
  }
  return index;
}

/**
 * Fetches a web page and extracts the title, description and publication
 * date from its metadata using regular expressions (no DOM parser).
 */
export class WebScraper {
  private readonly userAgent = 'Mozilla/5.0 (compatible; DailyTrends/1.0)';

  /**
   * Downloads `url` and extracts its metadata.
   *
   * @param url - Absolute URL of the page to scrape.
   * @returns The scraped data, or `null` when the request fails, the response
   * status is not OK, or the page lacks a title or description. Failures are
   * logged and never thrown.
   */
  async scrapeUrl(url: string): Promise<ScrapedData | null> {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': this.userAgent,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
      });

      if (!response.ok) {
        Logger.error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
        return null;
      }

      const html = await response.text();
      return this.parseHtml(html, url);
    } catch (error: unknown) {
      Logger.error(`Error scraping ${url}:`, error);
      return null;
    }
  }

  /**
   * Builds a ScrapedData record from raw HTML.
   *
   * @returns `null` (after logging a warning) when no non-blank title or
   * description can be found.
   */
  private parseHtml(html: string, url: string): ScrapedData | null {
    try {
      // Scan the <meta> tags once and share the index between the extractors.
      const meta = indexMetaTags(html);

      const title = this.extractTitle(html, meta);
      if (!title) {
        Logger.warn(`No title found for ${url}`);
        return null;
      }

      const description = this.extractDescription(html, meta);
      if (!description) {
        Logger.warn(`No description found for ${url}`);
        return null;
      }

      return {
        title,
        description,
        url,
        publishedAt: this.extractPublishedDate(html, meta)
      };
    } catch (error: unknown) {
      Logger.error(`Error parsing HTML for ${url}:`, error);
      return null;
    }
  }

  /**
   * Returns the page title, preferring `og:title`, then `twitter:title`, then
   * the `<title>` element. The result is entity-decoded and trimmed; blank
   * candidates are ignored.
   */
  private extractTitle(html: string, meta: MetaIndex = indexMetaTags(html)): string | null {
    const fromMeta = meta.get('og:title') ?? meta.get('twitter:title');
    if (fromMeta) {
      return fromMeta;
    }

    const titleTag = TITLE_TAG_PATTERN.exec(html);
    const text = decodeHtmlEntities(titleTag?.[1] ?? '').trim();
    return text === '' ? null : text;
  }

  /**
   * Returns the page description, preferring `og:description`, then
   * `twitter:description`, then the plain `description` meta tag.
   */
  private extractDescription(html: string, meta: MetaIndex = indexMetaTags(html)): string | null {
    return (
      meta.get('og:description') ??
      meta.get('twitter:description') ??
      meta.get('description') ??
      null
    );
  }

  /**
   * Returns the first parseable publication date among
   * `article:published_time`, `pubdate` and the `datetime` attribute of any
   * `<time>` element, falling back to the current time.
   */
  private extractPublishedDate(html: string, meta: MetaIndex = indexMetaTags(html)): Date {
    for (const key of ['article:published_time', 'pubdate']) {
      const parsed = WebScraper.toValidDate(meta.get(key));
      if (parsed) {
        return parsed;
      }
    }

    // Keep scanning past unparseable values such as durations ("PT2H").
    for (const tag of html.matchAll(TIME_TAG_PATTERN)) {
      const datetime = parseAttributes(tag[1] ?? '').get('datetime');
      const parsed = WebScraper.toValidDate(datetime && decodeHtmlEntities(datetime).trim());
      if (parsed) {
        return parsed;
      }
    }

    // Default to current date if no published date found
    return new Date();
  }

  /** Parses `value` as a date, returning `null` for missing or invalid input. */
  private static toValidDate(value: string | undefined): Date | null {
    if (!value) {
      return null;
    }
    const date = new Date(value);
    return Number.isNaN(date.getTime()) ? null : date;
  }

  /**
   * Converts scraped metadata into the payload used to create a feed entry.
   * Scraped entries are always flagged as non-manual.
   */
  convertToFeedData(scrapedData: ScrapedData, source: NewsSource): Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'> {
    return {
      title: scrapedData.title,
      description: scrapedData.description,
      url: scrapedData.url,
      source,
      publishedAt: scrapedData.publishedAt,
      isManual: false
    };
  }
}