Initial commit: Japan Senior News Collector
- FastAPI backend with news scraping from Yahoo Japan - SQLite database for article storage - Web UI with dark mode, article modal, statistics dashboard - Docker support for containerized deployment - API endpoints: /api/today, /api/news, /api/collect-news, /api/dates, /api/download-json - Auto-collect feature when requesting today's news - Content filtering for articles without body text
This commit is contained in:
167
scraper.py
Normal file
167
scraper.py
Normal file
@@ -0,0 +1,167 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List, Optional
|
||||
from database import Article
|
||||
from datetime import datetime
|
||||
import time
|
||||
import random
|
||||
|
||||
class NewsScraper:
    """Scrapes article headlines and bodies from Yahoo Japan News.

    Category pages are scanned for article links, each link is resolved to
    its final article URL (``/pickup/`` pages are teasers that point at the
    real story), and the full body text is fetched for every article kept.
    """

    BASE_URL = "https://news.yahoo.co.jp"

    # Seconds before an HTTP request is abandoned; without a timeout a
    # stalled connection would hang the whole collection run.
    REQUEST_TIMEOUT = 10

    CATEGORIES = {
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
        "Health": "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7&ei=utf-8"  # Search for 'Health'
    }

    # NOTE(review): currently unused by the scraping logic — presumably
    # intended for filtering Health-category titles; confirm before relying
    # on it or remove it.
    HEALTH_KEYWORDS = ["健康", "医療", "病気", "病院", "医師", "薬", "ワクチン", "感染", "介護", "認知症", "老化", "ダイエット", "運動", "睡眠", "ストレス", "メンタル"]

    def _get_soup(self, url: str) -> "BeautifulSoup":
        """Fetch *url* and return the parsed HTML document.

        Raises:
            requests.RequestException: on network failure, timeout, or a
                non-2xx response (via ``raise_for_status``).
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.content, "html.parser")

    def _absolutize(self, href: str) -> str:
        """Prefix site-relative hrefs with BASE_URL so they are fetchable."""
        return self.BASE_URL + href if href.startswith('/') else href

    def scrape_category(self, category_name: str, limit: int = 5) -> "List[Article]":
        """Collect up to *limit* articles (with bodies) for one category.

        Args:
            category_name: Key into ``CATEGORIES`` ("Economy", "Society", ...).
            limit: Maximum number of articles to return.

        Returns:
            A list of Article objects; empty when the category is unknown
            or the category page cannot be scraped at all.
        """
        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
            return []

        print(f"Scraping {category_name} from {url}...")
        try:
            soup = self._get_soup(url)

            articles = []

            # Yahoo Japan News article links contain
            # 'news.yahoo.co.jp/articles/' or 'news.yahoo.co.jp/pickup/'.
            candidates = soup.find_all('a')
            print(f"Found {len(candidates)} total links")

            seen_urls = set()

            for link in candidates:
                if len(articles) >= limit:
                    break

                href = link.get('href')
                if not href:
                    continue
                if ('news.yahoo.co.jp/articles/' not in href
                        and 'news.yahoo.co.jp/pickup/' not in href):
                    continue

                # Normalize site-relative links to absolute URLs.
                href = self._absolutize(href)
                if href in seen_urls:
                    continue

                # Very short link texts are nav/icon links, not headlines.
                title = link.get_text(strip=True)
                if len(title) < 5:
                    continue

                # Thumbnail, if the link wraps an <img>.
                img_tag = link.find('img')
                image_url = img_tag.get('src') if img_tag else None

                seen_urls.add(href)
                print(f"Found article: {title}")

                # Pickup pages are teasers — resolve to the real article URL.
                final_url = href
                if "/pickup/" in href:
                    print(f" Resolving pickup URL: {href}")
                    real_url = self.resolve_pickup_url(href)
                    if real_url:
                        print(f" -> Resolved to: {real_url}")
                        final_url = real_url

                article = Article(
                    title=title,
                    url=final_url,  # Store the final URL
                    image_url=image_url,
                    category=category_name,
                    published_date=datetime.now().strftime("%Y-%m-%d"),
                    collected_at=datetime.now().isoformat()
                )

                # Fetch full content; a failure keeps the article with a
                # placeholder body instead of dropping it.
                try:
                    print(f" Fetching content for {title[:10]}...")
                    article.content = self.scrape_article_body(final_url)
                    # Random pause so we do not hammer the site.
                    time.sleep(random.uniform(0.5, 1.5))
                except Exception as e:
                    print(f" Failed to fetch content: {e}")
                    article.content = "Failed to load content."

                articles.append(article)

            print(f"Total articles collected for {category_name}: {len(articles)}")

        except Exception as e:
            print(f"Error scraping {category_name}: {e}")
            import traceback
            traceback.print_exc()
            return []

        return articles

    def resolve_pickup_url(self, pickup_url: str) -> Optional[str]:
        """Resolve a ``/pickup/`` teaser page to the underlying article URL.

        Returns:
            The absolute article URL, or None when it cannot be found.
        """
        try:
            soup = self._get_soup(pickup_url)

            # Look for the "続きを読む" ("read more") link first.
            link = soup.find('a', string=lambda t: t and '続きを読む' in t)
            if link and link.get('href'):
                # Absolutize: a relative href here would be unfetchable later.
                return self._absolutize(link.get('href'))

            # Fallback: pickup pages usually carry a clear link to the full
            # story somewhere in the markup — take the first article link.
            for candidate in soup.find_all('a'):
                href = candidate.get('href')
                if href and 'news.yahoo.co.jp/articles/' in href:
                    return self._absolutize(href)

            return None
        except Exception as e:
            print(f"Error resolving pickup URL {pickup_url}: {e}")
            return None

    def scrape_article_body(self, url: str) -> str:
        """Fetch an article page and return its body text.

        Paragraphs are joined with blank lines. Returns the literal string
        "Content not found." when no body container is located, and "" on
        any request/parse error.
        """
        try:
            soup = self._get_soup(url)

            # Selector identified via browser tool:
            # div.sc-iMCRTP.eqMceQ.yjSlinkDirectLink (generic sc-iMCRTP works).
            container = soup.select_one("div.sc-iMCRTP")
            if not container:
                # Fallback if the generated class name changes.
                container = soup.find("div", class_=lambda x: x and "article_body" in x)

            if container:
                paragraphs = container.find_all('p')
                return "\n\n".join(p.get_text(strip=True) for p in paragraphs)

            return "Content not found."

        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""
if __name__ == "__main__":
    # Manual smoke test: pull two Society articles and dump them to stdout.
    demo_scraper = NewsScraper()
    collected = demo_scraper.scrape_category("Society", limit=2)
    print(collected)
|
||||
Reference in New Issue
Block a user