WebScraper

2025-07-29 12:45:28 +02:00
parent 891b1e478d
commit d8381c893d
2 changed files with 353 additions and 0 deletions
--- a/src/tests/WebScraper.test.ts
+++ b/src/tests/WebScraper.test.ts
@@ -0,0 +1,210 @@
+import { WebScraper } from '../utils/WebScraper';
+import { NewsSource } from '../types/Feed';
+import { Logger } from '../utils/logger';
+
+// Mock the Logger
+// The logger module is replaced with jest.fn() stubs so the tests below can
+// assert on Logger.error / Logger.warn calls made by the scraper.
+jest.mock('../utils/logger', () => ({
+  Logger: {
+    error: jest.fn(),
+    warn: jest.fn(),
+    info: jest.fn(),
+    debug: jest.fn()
+  }
+}));
+
+// Mock fetch
+// The global fetch is replaced with a jest mock; each test configures the
+// response (or rejection) it needs, so no real HTTP request is performed.
+global.fetch = jest.fn();
+
+/**
+ * Unit tests for WebScraper.
+ *
+ * `scrapeUrl` is exercised against a mocked global `fetch`; `convertToFeedData`
+ * is a pure mapping and is tested without any network stubbing.
+ */
+describe('WebScraper', () => {
+  let webScraper: WebScraper;
+  const mockFetch = fetch as jest.MockedFunction<typeof fetch>;
+
+  /** Makes the mocked fetch resolve with a successful response whose body is `html`. */
+  const respondWithHtml = (html: string): void => {
+    mockFetch.mockResolvedValue({
+      ok: true,
+      text: () => Promise.resolve(html)
+    } as Response);
+  };
+
+  beforeEach(() => {
+    webScraper = new WebScraper();
+    // Reset (not just clear) the mocks: clearing keeps configured implementations,
+    // so a response set up by one test could otherwise leak into the next.
+    jest.resetAllMocks();
+  });
+
+  describe('scrapeUrl', () => {
+    test('should successfully scrape a URL with complete metadata', async () => {
+      const mockHtml = `
+        <html>
+          <head>
+            <title>Test News Article</title>
+            <meta property="og:title" content="Test News Article">
+            <meta property="og:description" content="This is a test news article description">
+            <meta property="article:published_time" content="2024-01-15T10:30:00Z">
+          </head>
+          <body>
+            <h1>Test News Article</h1>
+            <p>Article content here...</p>
+          </body>
+        </html>
+      `;
+
+      respondWithHtml(mockHtml);
+
+      const result = await webScraper.scrapeUrl('https://example.com/news');
+
+      expect(result).toEqual({
+        title: 'Test News Article',
+        description: 'This is a test news article description',
+        url: 'https://example.com/news',
+        publishedAt: new Date('2024-01-15T10:30:00Z')
+      });
+
+      expect(mockFetch).toHaveBeenCalledWith('https://example.com/news', {
+        headers: {
+          'User-Agent': 'Mozilla/5.0 (compatible; DailyTrends/1.0)',
+          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+        }
+      });
+    });
+
+    test('should handle HTTP errors gracefully', async () => {
+      mockFetch.mockResolvedValue({
+        ok: false,
+        status: 404,
+        statusText: 'Not Found'
+      } as Response);
+
+      const result = await webScraper.scrapeUrl('https://example.com/not-found');
+
+      expect(result).toBeNull();
+      expect(Logger.error).toHaveBeenCalledWith(
+        'Failed to fetch https://example.com/not-found: 404 Not Found'
+      );
+    });
+
+    test('should handle network errors gracefully', async () => {
+      mockFetch.mockRejectedValue(new Error('Network error'));
+
+      const result = await webScraper.scrapeUrl('https://example.com/error');
+
+      expect(result).toBeNull();
+      expect(Logger.error).toHaveBeenCalledWith(
+        'Error scraping https://example.com/error:',
+        expect.any(Error)
+      );
+    });
+
+    test('should return null when no title is found', async () => {
+      const mockHtml = `
+        <html>
+          <head>
+            <meta property="og:description" content="Description without title">
+          </head>
+          <body>
+            <p>Content without title</p>
+          </body>
+        </html>
+      `;
+
+      respondWithHtml(mockHtml);
+
+      const result = await webScraper.scrapeUrl('https://example.com/no-title');
+
+      expect(result).toBeNull();
+      expect(Logger.warn).toHaveBeenCalledWith('No title found for https://example.com/no-title');
+    });
+
+    test('should return null when no description is found', async () => {
+      const mockHtml = `
+        <html>
+          <head>
+            <title>Title Only</title>
+          </head>
+          <body>
+            <p>Content without description meta</p>
+          </body>
+        </html>
+      `;
+
+      respondWithHtml(mockHtml);
+
+      const result = await webScraper.scrapeUrl('https://example.com/no-description');
+
+      expect(result).toBeNull();
+      expect(Logger.warn).toHaveBeenCalledWith('No description found for https://example.com/no-description');
+    });
+
+    test('should use current date when no published date is found', async () => {
+      const mockHtml = `
+        <html>
+          <head>
+            <title>Test Article</title>
+            <meta property="og:description" content="Test description">
+          </head>
+        </html>
+      `;
+
+      respondWithHtml(mockHtml);
+
+      // Bracket the call with timestamps: the fallback date must lie between them.
+      const beforeScrape = new Date();
+      const result = await webScraper.scrapeUrl('https://example.com/no-date');
+      const afterScrape = new Date();
+
+      expect(result).not.toBeNull();
+      expect(result!.publishedAt.getTime()).toBeGreaterThanOrEqual(beforeScrape.getTime());
+      expect(result!.publishedAt.getTime()).toBeLessThanOrEqual(afterScrape.getTime());
+    });
+
+    test('should handle HTML with special characters and entities', async () => {
+      const htmlWithEntities = `
+        <html>
+          <head>
+            <title>News &amp; Updates - El Pa&iacute;s</title>
+            <meta name="description" content="Breaking news &quot;today&quot; &amp; analysis">
+          </head>
+        </html>
+      `;
+
+      respondWithHtml(htmlWithEntities);
+
+      const result = await webScraper.scrapeUrl('https://example.com/news');
+
+      // The scraper returns the text as written in the markup; entities are not decoded.
+      expect(result).toEqual({
+        title: 'News &amp; Updates - El Pa&iacute;s',
+        description: 'Breaking news &quot;today&quot; &amp; analysis',
+        url: 'https://example.com/news',
+        publishedAt: expect.any(Date)
+      });
+    });
+  });
+
+  describe('convertToFeedData', () => {
+    test('should convert scraped data to feed format', () => {
+      const scrapedData = {
+        title: 'Test News',
+        description: 'Test description',
+        url: 'https://example.com/news',
+        publishedAt: new Date('2024-01-15T10:00:00Z')
+      };
+
+      const feedData = webScraper.convertToFeedData(scrapedData, NewsSource.EL_PAIS);
+
+      expect(feedData).toEqual({
+        title: 'Test News',
+        description: 'Test description',
+        url: 'https://example.com/news',
+        source: NewsSource.EL_PAIS,
+        publishedAt: new Date('2024-01-15T10:00:00Z'),
+        isManual: false
+      });
+    });
+  });
+});
--- a/src/utils/WebScraper.ts
+++ b/src/utils/WebScraper.ts
@@ -0,0 +1,143 @@
+import { IFeed, NewsSource } from '../types/Feed.js';
+import { Logger } from './logger.js';
+
+/**
+ * Metadata extracted from a single scraped page.
+ *
+ * Exported because it appears in the public signatures of `WebScraper`
+ * (`scrapeUrl` result, `convertToFeedData` argument), so callers need to be
+ * able to name it.
+ */
+export interface ScrapedData {
+  /** Page title taken from the page's metadata or `<title>` tag. */
+  title: string;
+  /** Page description taken from the page's meta tags. */
+  description: string;
+  /** The URL that was scraped. */
+  url: string;
+  /** Publication date found in the page, or the time of scraping when none was found. */
+  publishedAt: Date;
+}
+
+/**
+ * Downloads web pages and extracts the metadata needed to build a feed entry
+ * (title, description and publication date) from their HTML.
+ *
+ * Extraction is regex-based: only `<meta>`, `<title>` and `<time>` tags are
+ * inspected, and values are returned as they appear in the markup (HTML
+ * entities such as `&amp;` are not decoded).
+ */
+export class WebScraper {
+  private readonly userAgent = 'Mozilla/5.0 (compatible; DailyTrends/1.0)';
+
+  /**
+   * Fetches `url` and extracts its metadata.
+   *
+   * @param url Address of the page to scrape.
+   * @returns The extracted data, or `null` when the response is not OK, the
+   *   request fails, or the page lacks a title or a description. Failures are
+   *   logged rather than thrown.
+   */
+  async scrapeUrl(url: string): Promise<ScrapedData | null> {
+    try {
+      const response = await fetch(url, {
+        headers: {
+          'User-Agent': this.userAgent,
+          'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
+        }
+      });
+
+      if (!response.ok) {
+        Logger.error(`Failed to fetch ${url}: ${response.status} ${response.statusText}`);
+        return null;
+      }
+
+      const html = await response.text();
+      return this.parseHtml(html, url);
+    } catch (error) {
+      Logger.error(`Error scraping ${url}:`, error);
+      return null;
+    }
+  }
+
+  /**
+   * Builds a `ScrapedData` record from raw HTML.
+   *
+   * @param html Page markup.
+   * @param url Address the markup came from (used in the result and in log messages).
+   * @returns The record, or `null` (with a warning logged) when the title or
+   *   description is missing or blank.
+   */
+  private parseHtml(html: string, url: string): ScrapedData | null {
+    try {
+      // Trim before the emptiness check so a whitespace-only value counts as missing
+      // instead of producing an empty title/description.
+      const title = (this.extractTitle(html) || '').trim();
+      if (!title) {
+        Logger.warn(`No title found for ${url}`);
+        return null;
+      }
+
+      const description = (this.extractDescription(html) || '').trim();
+      if (!description) {
+        Logger.warn(`No description found for ${url}`);
+        return null;
+      }
+
+      const publishedAt = this.extractPublishedDate(html);
+
+      return { title, description, url, publishedAt };
+    } catch (error) {
+      Logger.error(`Error parsing HTML for ${url}:`, error);
+      return null;
+    }
+  }
+
+  /**
+   * Returns the page title, preferring `og:title`, then `twitter:title`, then
+   * the `<title>` element; `null` when none is present.
+   */
+  private extractTitle(html: string): string | null {
+    const fromMeta =
+      this.findMetaContent(html, 'og:title') || this.findMetaContent(html, 'twitter:title');
+    if (fromMeta) {
+      return fromMeta;
+    }
+
+    // Fall back to the <title> element.
+    const titleMatch = html.match(/<title[^>]*>([^<]+)<\/title>/i);
+    return titleMatch ? titleMatch[1] : null;
+  }
+
+  /**
+   * Returns the page description, preferring `og:description`, then
+   * `twitter:description`, then the plain `description` meta tag; `null` when
+   * none is present.
+   */
+  private extractDescription(html: string): string | null {
+    return (
+      this.findMetaContent(html, 'og:description') ||
+      this.findMetaContent(html, 'twitter:description') ||
+      this.findMetaContent(html, 'description') ||
+      null
+    );
+  }
+
+  /**
+   * Returns the publication date declared by the page.
+   *
+   * Sources are tried in order: `article:published_time` meta, `pubdate` meta,
+   * then the first `<time datetime="...">`. A value that does not parse to a
+   * valid date is skipped. When nothing usable is found the current date is
+   * returned.
+   */
+  private extractPublishedDate(html: string): Date {
+    const timeMatch = html.match(
+      /<time\s(?:[^>]*?\s)?datetime\s*=\s*(?:"([^"]+)"|'([^']+)')/i
+    );
+    const timeValue = timeMatch ? (timeMatch[1] !== undefined ? timeMatch[1] : timeMatch[2]) : null;
+
+    const candidates = [
+      this.findMetaContent(html, 'article:published_time'),
+      this.findMetaContent(html, 'pubdate'),
+      timeValue
+    ];
+
+    for (const candidate of candidates) {
+      if (!candidate) {
+        continue;
+      }
+      const date = new Date(candidate);
+      if (!Number.isNaN(date.getTime())) {
+        return date;
+      }
+    }
+
+    // Default to current date if no published date found
+    return new Date();
+  }
+
+  /**
+   * Finds the first `<meta>` tag whose `property` or `name` attribute equals
+   * `key` (case-insensitive) and returns its non-empty `content` value.
+   *
+   * Attributes may appear in any order, and the value is delimited by its own
+   * opening quote, so `content="It's here"` is returned in full rather than
+   * being cut at the apostrophe.
+   *
+   * @returns The raw attribute value, or `null` when no such tag exists.
+   */
+  private findMetaContent(html: string, key: string): string | null {
+    // Quoted values are consumed as a unit so a '>' inside a value does not end the tag.
+    const metaTag = /<meta\s+((?:[^>"']|"[^"]*"|'[^']*')*)>/gi;
+    const wantedKey = key.toLowerCase();
+
+    let tag: RegExpExecArray | null;
+    while ((tag = metaTag.exec(html)) !== null) {
+      const attributes = this.parseAttributes(tag[1]);
+      const content = attributes.get('content');
+      const matchesKey = [attributes.get('property'), attributes.get('name')].some(
+        (value) => value !== undefined && value.toLowerCase() === wantedKey
+      );
+      if (matchesKey && content) {
+        return content;
+      }
+    }
+
+    return null;
+  }
+
+  /**
+   * Parses the quoted `name="value"` / `name='value'` pairs of a tag into a map
+   * keyed by lower-cased attribute name. Unquoted attributes are ignored; when
+   * a name repeats, the first occurrence wins.
+   */
+  private parseAttributes(attributeText: string): Map<string, string> {
+    const attribute = /([^\s"'<>\/=]+)\s*=\s*(?:"([^"]*)"|'([^']*)')/g;
+    const attributes = new Map<string, string>();
+
+    let match: RegExpExecArray | null;
+    while ((match = attribute.exec(attributeText)) !== null) {
+      const name = match[1].toLowerCase();
+      if (!attributes.has(name)) {
+        attributes.set(name, match[2] !== undefined ? match[2] : match[3]);
+      }
+    }
+
+    return attributes;
+  }
+
+  /**
+   * Maps scraped data to the feed shape used for persistence.
+   *
+   * @param scrapedData Result of a successful scrape.
+   * @param source News source the page belongs to.
+   * @returns Feed fields with `isManual` set to `false`.
+   */
+  convertToFeedData(scrapedData: ScrapedData, source: NewsSource): Omit<IFeed, '_id' | 'createdAt' | 'updatedAt'> {
+    return {
+      title: scrapedData.title,
+      description: scrapedData.description,
+      url: scrapedData.url,
+      source,
+      publishedAt: scrapedData.publishedAt,
+      isManual: false
+    };
+  }
+}