WebScraper

src/__tests__/WebScraper.test.ts (new file, 210 lines)
@@ -0,0 +1,210 @@
import { WebScraper } from '../utils/WebScraper';
import { NewsSource } from '../types/Feed';
import { Logger } from '../utils/logger';

// Mock the Logger
jest.mock('../utils/logger', () => ({
  Logger: {
    error: jest.fn(),
    warn: jest.fn(),
    info: jest.fn(),
    debug: jest.fn()
  }
}));

// Mock fetch
global.fetch = jest.fn();

describe('WebScraper', () => {
  let webScraper: WebScraper;
  const mockFetch = fetch as jest.MockedFunction<typeof fetch>;

  beforeEach(() => {
    webScraper = new WebScraper();
    jest.clearAllMocks();
  });

  describe('scrapeUrl', () => {
    test('should successfully scrape a URL with complete metadata', async () => {
      const mockHtml = `
        <html>
          <head>
            <title>Test News Article</title>
            <meta property="og:title" content="Test News Article">
            <meta property="og:description" content="This is a test news article description">
            <meta property="article:published_time" content="2024-01-15T10:30:00Z">
          </head>
          <body>
            <h1>Test News Article</h1>
            <p>Article content here...</p>
          </body>
        </html>
      `;

      mockFetch.mockResolvedValue({
        ok: true,
        text: () => Promise.resolve(mockHtml)
      } as Response);

      const result = await webScraper.scrapeUrl('https://example.com/news');

      expect(result).toEqual({
        title: 'Test News Article',
        description: 'This is a test news article description',
        url: 'https://example.com/news',
        publishedAt: new Date('2024-01-15T10:30:00Z')
      });

      expect(mockFetch).toHaveBeenCalledWith('https://example.com/news', {
        headers: {
          'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
      });
    });

    test('should handle HTTP errors gracefully', async () => {
      mockFetch.mockResolvedValue({
        ok: false,
        status: 404,
        statusText: 'Not Found'
      } as Response);

      const result = await webScraper.scrapeUrl('https://example.com/not-found');

      expect(result).toBeNull();
      expect(Logger.error).toHaveBeenCalledWith(
        'Failed to fetch https://example.com/not-found: 404 Not Found'
      );
    });

    test('should handle network errors gracefully', async () => {
      mockFetch.mockRejectedValue(new Error('Network error'));

      const result = await webScraper.scrapeUrl('https://example.com/error');

      expect(result).toBeNull();
      expect(Logger.error).toHaveBeenCalledWith(
        'Error scraping https://example.com/error:',
        expect.any(Error)
      );
    });

    test('should return null when no title is found', async () => {
      const mockHtml = `
        <html>
          <head>
            <meta property="og:description" content="Description without title">
          </head>
          <body>
            <p>Content without title</p>
          </body>
        </html>
      `;

      mockFetch.mockResolvedValue({
        ok: true,
        text: () => Promise.resolve(mockHtml)
      } as Response);

      const result = await webScraper.scrapeUrl('https://example.com/no-title');

      expect(result).toBeNull();
      expect(Logger.warn).toHaveBeenCalledWith('No title found for https://example.com/no-title');
    });

    test('should return null when no description is found', async () => {
      const mockHtml = `
        <html>
          <head>
            <title>Title Only</title>
          </head>
          <body>
            <p>Content without description meta</p>
          </body>
        </html>
      `;

      mockFetch.mockResolvedValue({
        ok: true,
        text: () => Promise.resolve(mockHtml)
      } as Response);

      const result = await webScraper.scrapeUrl('https://example.com/no-description');

      expect(result).toBeNull();
      expect(Logger.warn).toHaveBeenCalledWith('No description found for https://example.com/no-description');
    });

    test('should use current date when no published date is found', async () => {
      const mockHtml = `
        <html>
          <head>
            <title>Test Article</title>
            <meta property="og:description" content="Test description">
          </head>
        </html>
      `;

      mockFetch.mockResolvedValue({
        ok: true,
        text: () => Promise.resolve(mockHtml)
      } as Response);

      const beforeScrape = new Date();
      const result = await webScraper.scrapeUrl('https://example.com/no-date');
      const afterScrape = new Date();

      expect(result).not.toBeNull();
      expect(result!.publishedAt.getTime()).toBeGreaterThanOrEqual(beforeScrape.getTime());
      expect(result!.publishedAt.getTime()).toBeLessThanOrEqual(afterScrape.getTime());
    });
  });

  describe('convertToFeedData', () => {
    test('should convert scraped data to feed format', () => {
      const scrapedData = {
        title: 'Test News',
        description: 'Test description',
        url: 'https://example.com/news',
        publishedAt: new Date('2024-01-15T10:00:00Z')
      };

      const feedData = webScraper.convertToFeedData(scrapedData, NewsSource.EL_PAIS);

      expect(feedData).toEqual({
        title: 'Test News',
        description: 'Test description',
        url: 'https://example.com/news',
        source: NewsSource.EL_PAIS,
        publishedAt: new Date('2024-01-15T10:00:00Z'),
        isManual: false
      });
    });

    test('should handle HTML with special characters', async () => {
      // The extractor's regex captures up to the first quote character, so
      // the mock metadata avoids embedded quotes; it still exercises '&'
      // and accented characters.
      const htmlWithSpecialChars = `
        <html>
          <head>
            <title>News & Updates - El País</title>
            <meta name="description" content="Breaking news today & analysis">
          </head>
        </html>
      `;

      mockFetch.mockResolvedValue({
        ok: true,
        text: () => Promise.resolve(htmlWithSpecialChars)
      } as Response);

      const result = await webScraper.scrapeUrl('https://example.com/news');

      expect(result).toEqual({
        title: 'News & Updates - El País',
        description: 'Breaking news today & analysis',
        url: 'https://example.com/news',
        publishedAt: expect.any(Date)
      });
    });
  });
});
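The no-date test above bounds the fallback publishedAt with a before/after window. If an exact assertion is preferred, a minimal alternative sketch, assuming Jest 27+ where modern fake timers are the default; it would sit inside the same describe('scrapeUrl') block, reusing the suite's webScraper and mockFetch, and its URL and mock HTML are illustrative, not part of this commit:

test('should pin the fallback date when no published date is found', async () => {
  // Sketch only: jest.setSystemTime requires modern fake timers (Jest 26+).
  jest.useFakeTimers();
  jest.setSystemTime(new Date('2024-01-15T12:00:00Z'));

  mockFetch.mockResolvedValue({
    ok: true,
    text: () => Promise.resolve(
      '<html><head><title>Test Article</title>' +
      '<meta property="og:description" content="Test description"></head></html>'
    )
  } as Response);

  const result = await webScraper.scrapeUrl('https://example.com/no-date');

  // new Date() inside extractPublishedDate now returns the pinned instant
  expect(result!.publishedAt).toEqual(new Date('2024-01-15T12:00:00Z'));

  jest.useRealTimers();
});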

src/utils/WebScraper.ts (new file, 143 lines)
@@ -0,0 +1,143 @@
import { IFeed, NewsSource } from '../types/Feed.js';
import { Logger } from './logger.js';

interface ScrapedData {
  title: string;
  description: string;
  url: string;
  publishedAt: Date;
}

export class WebScraper {
  private userAgent = 'Mozilla/5.0 (compatible; DailyTrends/1.0)';

  async scrapeUrl(url: string): Promise<ScrapedData | null> {
    try {
      const response = await fetch(url, {
        headers: {
          'User-Agent': this.userAgent,
          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
        }
      });

      if (!response.ok) {
        Logger.error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
        return null;
      }

      const html = await response.text();
      return this.parseHtml(html, url);
    } catch (error) {
      Logger.error(`Error scraping ${url}:`, error);
      return null;
    }
  }

  private parseHtml(html: string, url: string): ScrapedData | null {
    try {
      // Extract title from <title> tag or Open Graph
      const title = this.extractTitle(html);
      if (!title) {
        Logger.warn(`No title found for ${url}`);
        return null;
      }

      // Extract description from meta tags
      const description = this.extractDescription(html);
      if (!description) {
        Logger.warn(`No description found for ${url}`);
        return null;
      }

      // Extract published date
      const publishedAt = this.extractPublishedDate(html);

      return {
        title: title.trim(),
        description: description.trim(),
        url,
        publishedAt
      };
    } catch (error) {
      Logger.error(`Error parsing HTML for ${url}:`, error);
      return null;
    }
  }

  private extractTitle(html: string): string | null {
    // Try Open Graph title first
    const ogTitleMatch = html.match(/<meta\s+property=["']og:title["']\s+content=["']([^"']+)["']/i);
    if (ogTitleMatch) {
      return ogTitleMatch[1];
    }

    // Try Twitter title
    const twitterTitleMatch = html.match(/<meta\s+name=["']twitter:title["']\s+content=["']([^"']+)["']/i);
    if (twitterTitleMatch) {
      return twitterTitleMatch[1];
    }

    // Fall back to <title> tag
    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
    if (titleMatch) {
      return titleMatch[1];
    }

    return null;
  }

  private extractDescription(html: string): string | null {
    // Try Open Graph description first
    const ogDescMatch = html.match(/<meta\s+property=["']og:description["']\s+content=["']([^"']+)["']/i);
    if (ogDescMatch) {
      return ogDescMatch[1];
    }

    // Try Twitter description
    const twitterDescMatch = html.match(/<meta\s+name=["']twitter:description["']\s+content=["']([^"']+)["']/i);
    if (twitterDescMatch) {
      return twitterDescMatch[1];
    }

    // Try meta description
    const metaDescMatch = html.match(/<meta\s+name=["']description["']\s+content=["']([^"']+)["']/i);
    if (metaDescMatch) {
      return metaDescMatch[1];
    }

    return null;
  }

  private extractPublishedDate(html: string): Date {
    // Try various date formats
    const datePatterns = [
      /<meta\s+property=["']article:published_time["']\s+content=["']([^"']+)["']/i,
      /<meta\s+name=["']pubdate["']\s+content=["']([^"']+)["']/i,
      /<time[^>]+datetime=["']([^"']+)["']/i
    ];

    for (const pattern of datePatterns) {
      const match = html.match(pattern);
      if (match) {
        const date = new Date(match[1]);
        if (!isNaN(date.getTime())) {
          return date;
        }
      }
    }

    // Default to current date if no published date found
    return new Date();
  }

  convertToFeedData(scrapedData: ScrapedData, source: NewsSource): Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'> {
    return {
      title: scrapedData.title,
      description: scrapedData.description,
      url: scrapedData.url,
      source,
      publishedAt: scrapedData.publishedAt,
      isManual: false
    };
  }
}
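For context, a minimal usage sketch showing how the two public methods compose: scrape a URL, then convert the result for persistence. Everything here other than WebScraper, NewsSource, scrapeUrl and convertToFeedData (the function name, the URL, and the persistence comment) is a hypothetical assumption, not part of this commit:

// Hypothetical wiring sketch; ingestArticle and the example URL are
// illustrative assumptions, not code from this commit.
import { WebScraper } from './utils/WebScraper.js';
import { NewsSource } from './types/Feed.js';

async function ingestArticle(url: string): Promise<void> {
  const scraper = new WebScraper();

  const scraped = await scraper.scrapeUrl(url);
  if (!scraped) {
    // scrapeUrl already logged the failure; nothing to persist.
    return;
  }

  const feedData = scraper.convertToFeedData(scraped, NewsSource.EL_PAIS);
  // feedData matches Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'> and can
  // be handed to whatever persistence layer the project uses.
}

void ingestArticle('https://elpais.com/some-article');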