Files
collect-japan-news/scraper.py
kihong.kim 2d23d8dfc3 Improve Health category with senior-friendly content sources
- Replace single "健康" search with 4 specialized search sources:
  - 健康法 生活習慣 (health tips)
  - 健康 食事 栄養 (nutrition/food)
  - 体操 ストレッチ ウォーキング (exercise)
  - 認知症予防 老化防止 (prevention)
- Add sports/entertainment keyword filter to exclude irrelevant articles
- Collect articles from multiple sources for diverse senior health content

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-15 17:08:37 +09:00

289 lines
12 KiB
Python

import requests
from bs4 import BeautifulSoup
from typing import List, Optional
from database import Article
from datetime import datetime
import time
import random
class NewsScraper:
    """Scraper for Yahoo! Japan News.

    Regular categories ("Economy", "Society", "Lifestyle") are scraped from
    their category landing pages.  The "Health" category is special-cased:
    articles are gathered from several keyword-search result pages and
    filtered with keyword lists so the results stay senior-friendly.
    """

    BASE_URL = "https://news.yahoo.co.jp"

    # Seconds after which an HTTP request is abandoned.  Previously no
    # timeout was passed, so requests.get() could block forever on a
    # stalled connection.
    REQUEST_TIMEOUT = 10

    # Category name -> landing page URL.  "Health" maps to None because it
    # is assembled from HEALTH_SEARCH_SOURCES instead of a single page.
    CATEGORIES = {
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
        "Health": None,  # Special handling - uses multiple search sources
    }

    # Multiple search sources for senior-friendly health content.
    HEALTH_SEARCH_SOURCES = [
        # General health knowledge
        "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7%E6%B3%95+%E7%94%9F%E6%B4%BB%E7%BF%92%E6%85%A3&ei=utf-8",  # 健康法 生活習慣
        # Healthy food / nutrition
        "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7+%E9%A3%9F%E4%BA%8B+%E6%A0%84%E9%A4%8A&ei=utf-8",  # 健康 食事 栄養
        # Senior-friendly exercise
        "https://news.yahoo.co.jp/search?p=%E4%BD%93%E6%93%8D+%E3%82%B9%E3%83%88%E3%83%AC%E3%83%83%E3%83%81+%E3%82%A6%E3%82%A9%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0&ei=utf-8",  # 体操 ストレッチ ウォーキング
        # Disease prevention
        "https://news.yahoo.co.jp/search?p=%E8%AA%8D%E7%9F%A5%E7%97%87%E4%BA%88%E9%98%B2+%E8%80%81%E5%8C%96%E9%98%B2%E6%AD%A2&ei=utf-8",  # 認知症予防 老化防止
    ]

    # Keywords typical of senior-friendly health content.
    # NOTE(review): currently unused inside this class — presumably intended
    # for positive filtering/ranking; confirm with callers before removing.
    HEALTH_POSITIVE_KEYWORDS = [
        "健康法", "栄養", "食事", "食べ物", "レシピ", "体操", "ストレッチ", "ウォーキング",
        "睡眠", "生活習慣", "予防", "免疫", "認知症予防", "筋トレ", "シニア", "高齢者",
        "長寿", "元気", "若返り", "老化防止", "サプリ", "ビタミン", "血圧", "血糖値"
    ]

    # Keywords used to drop sports/entertainment articles that the health
    # searches also match (e.g. an athlete's "health" story).
    HEALTH_EXCLUDE_KEYWORDS = [
        "選手", "試合", "優勝", "大会", "リーグ", "プロ野球", "サッカー", "相撲",
        "芸能", "俳優", "女優", "タレント", "アイドル", "ドラマ", "映画"
    ]

    def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
        """Collect up to *limit* articles for *category_name*.

        Returns an empty list for unknown categories or when the category
        page cannot be fetched/parsed.
        """
        # Special handling for Health category - use multiple sources.
        if category_name == "Health":
            return self._scrape_health_articles(limit)

        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
            return []

        print(f"Scraping {category_name} from {url}...")
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            articles = []
            # Yahoo Japan News article links typically contain
            # 'news.yahoo.co.jp/articles/' or 'news.yahoo.co.jp/pickup/'.
            candidates = soup.find_all('a')
            print(f"Found {len(candidates)} total links")
            seen_urls = set()
            for link in candidates:
                if len(articles) >= limit:
                    break
                href = link.get('href')
                if not href:
                    continue
                if 'news.yahoo.co.jp/articles/' in href or 'news.yahoo.co.jp/pickup/' in href:
                    # Normalize site-relative links to absolute URLs.
                    if href.startswith('/'):
                        href = self.BASE_URL + href
                    if href in seen_urls:
                        continue
                    title = link.get_text(strip=True)
                    # Very short link texts are navigation labels, not headlines.
                    if len(title) < 5:
                        continue
                    # (The old Health keyword filter that lived here was
                    # unreachable: "Health" returns early above and other
                    # "Health*" names never pass the CATEGORIES lookup.
                    # Health filtering lives in _scrape_health_articles.)
                    img_tag = link.find('img')
                    image_url = img_tag.get('src') if img_tag else None
                    seen_urls.add(href)
                    print(f"Found article: {title}")
                    # Pickup pages are teasers; resolve to the real article URL.
                    final_url = href
                    if "/pickup/" in href:
                        print(f" Resolving pickup URL: {href}")
                        real_url = self.resolve_pickup_url(href)
                        if real_url:
                            print(f" -> Resolved to: {real_url}")
                            final_url = real_url
                    article = Article(
                        title=title,
                        url=final_url,  # Store the final URL
                        image_url=image_url,
                        category=category_name,
                        published_date=datetime.now().strftime("%Y-%m-%d"),
                        collected_at=datetime.now().isoformat()
                    )
                    # Fetch full article text; a failure here only degrades
                    # this one article rather than aborting the whole run.
                    try:
                        print(f" Fetching content for {title[:10]}...")
                        article.content = self.scrape_article_body(final_url)
                        # Polite random delay so we don't hammer the site.
                        time.sleep(random.uniform(0.5, 1.5))
                    except Exception as e:
                        print(f" Failed to fetch content: {e}")
                        article.content = "Failed to load content."
                    articles.append(article)
            print(f"Total articles collected for {category_name}: {len(articles)}")
        except Exception as e:
            print(f"Error scraping {category_name}: {e}")
            import traceback
            traceback.print_exc()
            return []
        return articles

    def resolve_pickup_url(self, pickup_url: str) -> Optional[str]:
        """Follow a /pickup/ teaser page to the full article URL.

        Returns an absolute URL, or None when the page cannot be fetched or
        no article link is found on it.
        """
        try:
            response = requests.get(pickup_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # Preferred: the explicit "続きを読む" ("read more") link.
            link = soup.find('a', string=lambda t: t and '続きを読む' in t)
            if link and link.get('href'):
                return self._absolutize(link.get('href'))
            # Fallback: any news.yahoo.co.jp/articles/ link on the page —
            # pickup pages usually link the full story prominently.
            candidates = soup.find_all('a')
            for l in candidates:
                href = l.get('href')
                if href and 'news.yahoo.co.jp/articles/' in href:
                    return self._absolutize(href)
            return None
        except Exception as e:
            print(f"Error resolving pickup URL {pickup_url}: {e}")
            return None

    def _absolutize(self, href: str) -> str:
        """Prefix a site-relative href with the site origin; pass absolute URLs through."""
        return self.BASE_URL + href if href.startswith('/') else href

    def scrape_article_body(self, url: str) -> str:
        """Fetch *url* and return its article text, paragraphs joined by blank lines.

        Returns "Content not found." when no known body container matches,
        and "" when the request itself fails.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # Selector identified via browser tool:
            # div.sc-iMCRTP.eqMceQ.yjSlinkDirectLink (generic sc-iMCRTP works).
            container = soup.select_one("div.sc-iMCRTP")
            if not container:
                # Fallback if the generated class name changes between deploys.
                container = soup.find("div", class_=lambda x: x and "article_body" in x)
            if container:
                paragraphs = container.find_all('p')
                return "\n\n".join([p.get_text(strip=True) for p in paragraphs])
            return "Content not found."
        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""

    def _scrape_health_articles(self, limit: int = 5) -> List[Article]:
        """Scrape health articles from multiple sources with filtering for senior-friendly content."""
        print("Scraping Health articles from multiple sources...")
        all_articles = []
        seen_urls = set()
        # Spread the quota across sources so results stay diverse.
        articles_per_source = max(2, limit // len(self.HEALTH_SEARCH_SOURCES) + 1)
        for source_url in self.HEALTH_SEARCH_SOURCES:
            if len(all_articles) >= limit:
                break
            print(f" Searching: {source_url[:80]}...")
            try:
                response = requests.get(source_url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, "html.parser")
                candidates = soup.find_all('a')
                source_count = 0
                for link in candidates:
                    if source_count >= articles_per_source or len(all_articles) >= limit:
                        break
                    href = link.get('href')
                    if not href:
                        continue
                    if 'news.yahoo.co.jp/articles/' in href or 'news.yahoo.co.jp/pickup/' in href:
                        if href.startswith('/'):
                            href = self.BASE_URL + href
                        if href in seen_urls:
                            continue
                        title = link.get_text(strip=True)
                        if len(title) < 5:
                            continue
                        # Filter out sports/entertainment news.
                        if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS):
                            print(f" Skipping (sports/entertainment): {title[:30]}...")
                            continue
                        seen_urls.add(href)
                        print(f" Found: {title[:40]}...")
                        img_tag = link.find('img')
                        image_url = img_tag.get('src') if img_tag else None
                        # Resolve pickup teasers to real article URLs.
                        final_url = href
                        if "/pickup/" in href:
                            real_url = self.resolve_pickup_url(href)
                            if real_url:
                                final_url = real_url
                        article = Article(
                            title=title,
                            url=final_url,
                            image_url=image_url,
                            category="Health",
                            published_date=datetime.now().strftime("%Y-%m-%d"),
                            collected_at=datetime.now().isoformat()
                        )
                        try:
                            article.content = self.scrape_article_body(final_url)
                            time.sleep(random.uniform(0.5, 1.0))
                        except Exception as e:
                            print(f" Failed to fetch content: {e}")
                            article.content = "Failed to load content."
                        all_articles.append(article)
                        source_count += 1
            except Exception as e:
                # One bad search source should not kill the other sources.
                print(f" Error with source {source_url[:50]}: {e}")
                continue
            time.sleep(random.uniform(0.5, 1.0))
        print(f"Total Health articles collected: {len(all_articles)}")
        return all_articles
if __name__ == "__main__":
    # Manual smoke test: fetch a couple of Society articles and dump them.
    demo_scraper = NewsScraper()
    fetched = demo_scraper.scrape_category("Society", limit=2)
    print(fetched)