import random
import time
from datetime import datetime
from typing import List, Optional

import requests
from bs4 import BeautifulSoup

from database import Article


class NewsScraper:
    """Scrape article headlines and bodies from Yahoo! Japan News.

    Categories map to listing/search pages; article links are harvested
    from anchor tags, "pickup" teaser URLs are resolved to the real
    article URL, and each article body is fetched individually.
    """

    BASE_URL = "https://news.yahoo.co.jp"

    # Category name -> listing (or search) page to harvest links from.
    CATEGORIES = {
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
        "Health": "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7&ei=utf-8",  # Search for 'Health'
    }

    # Japanese health-related keywords used to filter titles in the
    # "Health" category (which is backed by a generic keyword search).
    HEALTH_KEYWORDS = ["健康", "医療", "病気", "病院", "医師", "薬", "ワクチン", "感染",
                       "介護", "認知症", "老化", "ダイエット", "運動", "睡眠", "ストレス", "メンタル"]

    # Seconds before an HTTP request is abandoned. BUG FIX: the original
    # called requests.get() with no timeout, so one stalled connection
    # could hang the whole scraper forever.
    REQUEST_TIMEOUT = 10

    def _absolutize(self, href: str) -> str:
        """Return *href* as an absolute URL, prefixing BASE_URL for
        site-relative links ("/articles/...")."""
        return self.BASE_URL + href if href.startswith('/') else href

    def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
        """Collect up to *limit* articles for *category_name*.

        Returns a list of Article objects with `content` populated (or a
        placeholder string if the body fetch failed). Unknown categories
        return an empty list. On a mid-scrape error, articles collected
        so far are returned (BUG FIX: the original discarded them and
        returned []).
        """
        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
            return []

        print(f"Scraping {category_name} from {url}...")
        articles: List[Article] = []
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Yahoo Japan News article links contain
            # 'news.yahoo.co.jp/articles/' or 'news.yahoo.co.jp/pickup/'.
            candidates = soup.find_all('a')
            print(f"Found {len(candidates)} total links")

            seen_urls = set()  # de-duplicate: the same story appears in several modules
            for link in candidates:
                if len(articles) >= limit:
                    break
                href = link.get('href')
                if not href:
                    continue
                if ('news.yahoo.co.jp/articles/' not in href
                        and 'news.yahoo.co.jp/pickup/' not in href):
                    continue

                href = self._absolutize(href)
                if href in seen_urls:
                    continue

                title = link.get_text(strip=True)
                if len(title) < 5:  # skip icon/thumbnail anchors with no real title
                    continue

                # BUG FIX: the original had a dead `pass` here and never
                # used HEALTH_KEYWORDS. The "Health" category is a broad
                # keyword search, so keep only titles that actually
                # mention a health-related term.
                if category_name == "Health" and not any(
                        kw in title for kw in self.HEALTH_KEYWORDS):
                    continue

                # Thumbnail, if the anchor wraps an <img>. BUG FIX:
                # absolutize relative src values as well.
                img_tag = link.find('img')
                image_url = img_tag.get('src') if img_tag else None
                if image_url:
                    image_url = self._absolutize(image_url)

                seen_urls.add(href)
                print(f"Found article: {title}")

                # Pickup URLs are teaser pages; resolve to the real article.
                final_url = href
                if "/pickup/" in href:
                    print(f"  Resolving pickup URL: {href}")
                    real_url = self.resolve_pickup_url(href)
                    if real_url:
                        print(f"  -> Resolved to: {real_url}")
                        final_url = real_url

                article = Article(
                    title=title,
                    url=final_url,  # store the resolved URL
                    image_url=image_url,
                    category=category_name,
                    published_date=datetime.now().strftime("%Y-%m-%d"),
                    collected_at=datetime.now().isoformat(),
                )

                # Fetch the full body; failures degrade to a placeholder
                # rather than aborting the whole category.
                try:
                    print(f"  Fetching content for {title[:10]}...")
                    article.content = self.scrape_article_body(final_url)
                    # Polite jittered delay between article fetches.
                    time.sleep(random.uniform(0.5, 1.5))
                except Exception as e:
                    print(f"  Failed to fetch content: {e}")
                    article.content = "Failed to load content."

                articles.append(article)

            print(f"Total articles collected for {category_name}: {len(articles)}")
        except Exception as e:
            print(f"Error scraping {category_name}: {e}")
            import traceback
            traceback.print_exc()
            # fall through: return whatever was collected before the failure
        return articles

    def resolve_pickup_url(self, pickup_url: str) -> Optional[str]:
        """Resolve a /pickup/ teaser page to the full article URL.

        Prefers the "続きを読む" ("read more") link; falls back to the
        first /articles/ link on the page. Returns None when nothing is
        found or the request fails. BUG FIX: resolved hrefs may be
        site-relative, so they are absolutized before being returned
        (the original returned them verbatim, breaking the follow-up
        body fetch).
        """
        try:
            response = requests.get(pickup_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Primary: explicit "続きを読む" (read more) link.
            link = soup.find('a', string=lambda t: t and '続きを読む' in t)
            if link and link.get('href'):
                return self._absolutize(link.get('href'))

            # Fallback: any /articles/ link — pickup pages normally carry
            # one clear link to the full story.
            for candidate in soup.find_all('a'):
                href = candidate.get('href')
                if href and 'news.yahoo.co.jp/articles/' in href:
                    return self._absolutize(href)
            return None
        except Exception as e:
            print(f"Error resolving pickup URL {pickup_url}: {e}")
            return None

    def scrape_article_body(self, url: str) -> str:
        """Fetch *url* and return the article body text.

        Paragraphs inside the main body container are joined with blank
        lines. Returns "Content not found." when no container matches,
        and "" when the request itself fails (sentinels kept for
        backward compatibility with existing callers).
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Selector identified via browser inspection:
            # div.sc-iMCRTP (generated class; may rotate — TODO confirm
            # periodically). Fallback: any div whose class mentions
            # "article_body".
            container = soup.select_one("div.sc-iMCRTP")
            if not container:
                container = soup.find(
                    "div", class_=lambda x: x and "article_body" in x)

            if container:
                paragraphs = container.find_all('p')
                return "\n\n".join(p.get_text(strip=True) for p in paragraphs)
            return "Content not found."
        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""


if __name__ == "__main__":
    # Test run: scrape two Society articles and dump the result.
    scraper = NewsScraper()
    news = scraper.scrape_category("Society", limit=2)
    print(news)