Files
collect-japan-news/scraper.py
kihong.kim 56a6de61ce Initial commit: Japan Senior News Collector
- FastAPI backend with news scraping from Yahoo Japan
- SQLite database for article storage
- Web UI with dark mode, article modal, statistics dashboard
- Docker support for containerized deployment
- API endpoints: /api/today, /api/news, /api/collect-news, /api/dates, /api/download-json
- Auto-collect feature when requesting today's news
- Content filtering for articles without body text
2025-12-15 15:55:37 +09:00

168 lines
6.6 KiB
Python

import requests
from bs4 import BeautifulSoup
from typing import List, Optional
from database import Article
from datetime import datetime
import time
import random
class NewsScraper:
    """Scrapes senior-oriented news articles from Yahoo Japan News.

    For each configured category the scraper scans the category landing page
    for article links, resolves ``/pickup/`` redirect pages to their final
    article URL, and fetches the full body text of each article.
    """

    BASE_URL = "https://news.yahoo.co.jp"

    # Category name -> landing page URL to scan for article links.
    CATEGORIES = {
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
        "Health": "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7&ei=utf-8",  # Search for 'Health' (健康)
    }

    # Keywords identifying health-related articles.
    # FIX: the original list contained an empty string "", which would match
    # every title under substring filtering — removed.
    HEALTH_KEYWORDS = [
        "健康", "医療", "病気", "病院", "医師", "ワクチン", "感染", "介護",
        "認知症", "老化", "ダイエット", "運動", "睡眠", "ストレス", "メンタル",
    ]

    # Seconds before an HTTP request is abandoned. FIX: the original calls had
    # no timeout, so a single stalled server could hang the collector forever.
    REQUEST_TIMEOUT = 10

    def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
        """Collect up to ``limit`` articles from one configured category.

        Args:
            category_name: A key of ``CATEGORIES``; unknown names yield [].
            limit: Maximum number of articles to collect.

        Returns:
            A list of ``Article`` objects with ``content`` populated (or a
            placeholder string when the body fetch fails).  Returns an empty
            list on any scraping error.
        """
        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
            return []
        print(f"Scraping {category_name} from {url}...")
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            articles = []
            # Yahoo Japan News article links contain either
            # 'news.yahoo.co.jp/articles/' or 'news.yahoo.co.jp/pickup/'.
            candidates = soup.find_all('a')
            print(f"Found {len(candidates)} total links")
            seen_urls = set()
            for link in candidates:
                if len(articles) >= limit:
                    break
                href = link.get('href')
                if not href:
                    continue
                if 'news.yahoo.co.jp/articles/' not in href and 'news.yahoo.co.jp/pickup/' not in href:
                    continue
                # Normalize relative links to absolute URLs.
                if href.startswith('/'):
                    href = self.BASE_URL + href
                if href in seen_urls:
                    continue
                title = link.get_text(strip=True)
                # Very short link text is navigation chrome, not a headline.
                if len(title) < 5:
                    continue
                img_tag = link.find('img')
                image_url = img_tag.get('src') if img_tag else None
                seen_urls.add(href)
                print(f"Found article: {title}")
                # Pickup pages are redirect stubs; resolve to the real article.
                final_url = href
                if "/pickup/" in href:
                    print(f"  Resolving pickup URL: {href}")
                    real_url = self.resolve_pickup_url(href)
                    if real_url:
                        print(f"  -> Resolved to: {real_url}")
                        final_url = real_url
                article = Article(
                    title=title,
                    url=final_url,  # Store the resolved URL
                    image_url=image_url,
                    category=category_name,
                    published_date=datetime.now().strftime("%Y-%m-%d"),
                    collected_at=datetime.now().isoformat(),
                )
                # Fetch the full body text for the article.
                try:
                    print(f"  Fetching content for {title[:10]}...")
                    article.content = self.scrape_article_body(final_url)
                    # Polite random delay between article fetches.
                    time.sleep(random.uniform(0.5, 1.5))
                except Exception as e:
                    print(f"  Failed to fetch content: {e}")
                    article.content = "Failed to load content."
                articles.append(article)
            print(f"Total articles collected for {category_name}: {len(articles)}")
        except Exception as e:
            print(f"Error scraping {category_name}: {e}")
            import traceback
            traceback.print_exc()
            return []
        return articles

    def resolve_pickup_url(self, pickup_url: str) -> Optional[str]:
        """Resolve a Yahoo ``/pickup/`` stub page to the real article URL.

        Returns the absolute article URL, or None when resolution fails.
        """
        try:
            response = requests.get(pickup_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # Preferred: the "続きを読む" ("read more") link points at the article.
            link = soup.find('a', string=lambda t: t and '続きを読む' in t)
            if link and link.get('href'):
                href = link.get('href')
                # FIX: the "read more" href can be site-relative; callers
                # expect an absolute URL, so normalize it here.
                if href.startswith('/'):
                    href = self.BASE_URL + href
                return href
            # Fallback: any /articles/ link on the stub page.
            for candidate in soup.find_all('a'):
                href = candidate.get('href')
                if href and 'news.yahoo.co.jp/articles/' in href:
                    return href
            return None
        except Exception as e:
            print(f"Error resolving pickup URL {pickup_url}: {e}")
            return None

    def scrape_article_body(self, url: str) -> str:
        """Fetch an article page and return its body text.

        Paragraphs are joined with blank lines.  Returns
        "Content not found." when no body container is located, and ""
        on a request/parse error (downstream content filtering treats an
        empty body as "no content").
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # Selector identified via browser inspection; the sc-* class is a
            # generated styled-components name and may change over time.
            container = soup.select_one("div.sc-iMCRTP")
            if not container:
                # Fallback if the generated class name changes.
                container = soup.find("div", class_=lambda x: x and "article_body" in x)
            if container:
                paragraphs = container.find_all('p')
                return "\n\n".join(p.get_text(strip=True) for p in paragraphs)
            return "Content not found."
        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""
if __name__ == "__main__":
    # Manual smoke test: collect a couple of Society articles and dump them.
    collector = NewsScraper()
    sample = collector.scrape_category("Society", limit=2)
    print(sample)