import random
import time
from datetime import datetime
from typing import List, Optional

import requests
from bs4 import BeautifulSoup

from database import Article


class NewsScraper:
    """Scrape news articles from Yahoo! Japan News.

    Regular categories are scraped from their category landing pages.
    The "Health" category is special-cased: it is assembled from several
    keyword-search result pages and filtered so that sports/entertainment
    items are dropped, targeting senior-friendly health content.
    """

    BASE_URL = "https://news.yahoo.co.jp"

    # Abort any HTTP request after this many seconds. Without a timeout,
    # a stalled connection would hang the scraper indefinitely.
    REQUEST_TIMEOUT = 10

    CATEGORIES = {
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
        "Health": None,  # Special handling - uses multiple search sources
    }

    # Multiple search sources for senior-friendly health content
    HEALTH_SEARCH_SOURCES = [
        # Health methods / lifestyle habits
        "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7%E6%B3%95+%E7%94%9F%E6%B4%BB%E7%BF%92%E6%85%A3&ei=utf-8",  # 健康法 生活習慣
        # Healthy food / nutrition
        "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7+%E9%A3%9F%E4%BA%8B+%E6%A0%84%E9%A4%8A&ei=utf-8",  # 健康 食事 栄養
        # Senior exercise / stretching / walking
        "https://news.yahoo.co.jp/search?p=%E4%BD%93%E6%93%8D+%E3%82%B9%E3%83%88%E3%83%AC%E3%83%83%E3%83%81+%E3%82%A6%E3%82%A9%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0&ei=utf-8",  # 体操 ストレッチ ウォーキング
        # Disease prevention
        "https://news.yahoo.co.jp/search?p=%E8%AA%8D%E7%9F%A5%E7%97%87%E4%BA%88%E9%98%B2+%E8%80%81%E5%8C%96%E9%98%B2%E6%AD%A2&ei=utf-8",  # 認知症予防 老化防止
    ]

    # Keywords for senior-friendly health content.
    # NOTE(review): currently unused -- health filtering relies only on the
    # exclude list below. Kept as a public constant for a future
    # positive-match filter; confirm before removing.
    HEALTH_POSITIVE_KEYWORDS = [
        "健康法", "栄養", "食事", "食べ物", "レシピ", "体操", "ストレッチ",
        "ウォーキング", "睡眠", "生活習慣", "予防", "免疫", "認知症予防",
        "筋トレ", "シニア", "高齢者", "長寿", "元気", "若返り", "老化防止",
        "サプリ", "ビタミン", "血圧", "血糖値"
    ]

    # Keywords to filter out (sports/entertainment news)
    HEALTH_EXCLUDE_KEYWORDS = [
        "選手", "試合", "優勝", "大会", "リーグ", "プロ野球", "サッカー",
        "相撲", "芸能", "俳優", "女優", "タレント", "アイドル", "ドラマ", "映画"
    ]

    def _fetch_soup(self, url: str) -> BeautifulSoup:
        """GET *url* and return the parsed HTML.

        Raises requests.HTTPError on non-2xx responses and
        requests.Timeout if the request exceeds REQUEST_TIMEOUT.
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.content, "html.parser")

    def _absolutize(self, href: str) -> str:
        """Turn a site-relative href into an absolute Yahoo News URL."""
        return self.BASE_URL + href if href.startswith('/') else href

    def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
        """Collect up to *limit* articles for *category_name*.

        Each returned Article has its full body text already fetched.
        Returns an empty list for unknown categories or on scrape errors.
        """
        # Special handling for Health category - use multiple sources
        if category_name == "Health":
            return self._scrape_health_articles(limit)

        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
            return []

        print(f"Scraping {category_name} from {url}...")
        try:
            soup = self._fetch_soup(url)
            articles = []
            # Yahoo Japan News article links typically contain
            # 'news.yahoo.co.jp/articles/' or 'news.yahoo.co.jp/pickup/'
            candidates = soup.find_all('a')
            print(f"Found {len(candidates)} total links")
            seen_urls = set()
            for link in candidates:
                if len(articles) >= limit:
                    break
                href = link.get('href')
                if not href:
                    continue
                if 'news.yahoo.co.jp/articles/' in href or 'news.yahoo.co.jp/pickup/' in href:
                    href = self._absolutize(href)
                    if href in seen_urls:
                        continue
                    title = link.get_text(strip=True)
                    if len(title) < 5:
                        # Too short to be a real headline (icon/ad links)
                        continue
                    # Filter health articles to exclude sports/entertainment.
                    # (Dead in practice: "Health" returns early above; kept as
                    # a guard should a "Health*"-named category ever be added.)
                    if category_name.startswith("Health"):
                        if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS):
                            print(f" Skipping (excluded): {title[:30]}...")
                            continue
                    # Image extraction
                    img_tag = link.find('img')
                    image_url = img_tag.get('src') if img_tag else None
                    seen_urls.add(href)
                    print(f"Found article: {title}")
                    # Handle Pickup URLs - resolve to the real article URL
                    final_url = href
                    if "/pickup/" in href:
                        print(f" Resolving pickup URL: {href}")
                        real_url = self.resolve_pickup_url(href)
                        if real_url:
                            print(f" -> Resolved to: {real_url}")
                            final_url = real_url
                    article = Article(
                        title=title,
                        url=final_url,  # Store the final URL
                        image_url=image_url,
                        category=category_name,
                        published_date=datetime.now().strftime("%Y-%m-%d"),
                        collected_at=datetime.now().isoformat()
                    )
                    # Fetch full content
                    try:
                        print(f" Fetching content for {title[:10]}...")
                        article.content = self.scrape_article_body(final_url)
                        # Polite delay so we do not hammer the server
                        time.sleep(random.uniform(0.5, 1.5))
                    except Exception as e:
                        print(f" Failed to fetch content: {e}")
                        article.content = "Failed to load content."
                    articles.append(article)
            print(f"Total articles collected for {category_name}: {len(articles)}")
        except Exception as e:
            print(f"Error scraping {category_name}: {e}")
            import traceback
            traceback.print_exc()
            return []
        return articles

    def resolve_pickup_url(self, pickup_url: str) -> Optional[str]:
        """Follow a /pickup/ page and return the underlying article URL.

        Returns None when no article link can be found or the request fails.
        """
        try:
            soup = self._fetch_soup(pickup_url)
            # Look for the "続きを読む" ("read more") link first
            link = soup.find('a', string=lambda t: t and '続きを読む' in t)
            if link and link.get('href'):
                # BUGFIX: hrefs can be site-relative; normalize so the
                # caller can fetch the article directly.
                return self._absolutize(link.get('href'))
            # Fallback: look for any news.yahoo.co.jp/articles/ link in the
            # page - pickup pages usually link the full story somewhere.
            for candidate in soup.find_all('a'):
                href = candidate.get('href')
                if href and 'news.yahoo.co.jp/articles/' in href:
                    return self._absolutize(href)
            return None
        except Exception as e:
            print(f"Error resolving pickup URL {pickup_url}: {e}")
            return None

    def scrape_article_body(self, url: str) -> str:
        """Return the plain-text article body from *url*.

        Returns "Content not found." when the body container is missing,
        and "" when the request itself fails.
        """
        try:
            soup = self._fetch_soup(url)
            # Selector identified via browser tool:
            # div.sc-iMCRTP.eqMceQ.yjSlinkDirectLink (or generic sc-iMCRTP).
            container = soup.select_one("div.sc-iMCRTP")
            if not container:
                # Fallback to article_body class search if the generated
                # class name changes between deployments.
                container = soup.find("div", class_=lambda x: x and "article_body" in x)
            if container:
                paragraphs = container.find_all('p')
                return "\n\n".join(p.get_text(strip=True) for p in paragraphs)
            return "Content not found."
        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""

    def _scrape_health_articles(self, limit: int = 5) -> List[Article]:
        """Scrape health articles from multiple sources with filtering for senior-friendly content."""
        print("Scraping Health articles from multiple sources...")
        all_articles = []
        seen_urls = set()
        # Spread the quota roughly evenly across the search sources,
        # taking at least 2 per source.
        articles_per_source = max(2, limit // len(self.HEALTH_SEARCH_SOURCES) + 1)

        for source_url in self.HEALTH_SEARCH_SOURCES:
            if len(all_articles) >= limit:
                break
            print(f" Searching: {source_url[:80]}...")
            try:
                soup = self._fetch_soup(source_url)
                source_count = 0
                for link in soup.find_all('a'):
                    if source_count >= articles_per_source or len(all_articles) >= limit:
                        break
                    href = link.get('href')
                    if not href:
                        continue
                    if 'news.yahoo.co.jp/articles/' in href or 'news.yahoo.co.jp/pickup/' in href:
                        href = self._absolutize(href)
                        if href in seen_urls:
                            continue
                        title = link.get_text(strip=True)
                        if len(title) < 5:
                            continue
                        # Filter out sports/entertainment news
                        if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS):
                            print(f" Skipping (sports/entertainment): {title[:30]}...")
                            continue
                        seen_urls.add(href)
                        print(f" Found: {title[:40]}...")
                        # Image extraction
                        img_tag = link.find('img')
                        image_url = img_tag.get('src') if img_tag else None
                        # Resolve pickup URLs to the real article
                        final_url = href
                        if "/pickup/" in href:
                            real_url = self.resolve_pickup_url(href)
                            if real_url:
                                final_url = real_url
                        article = Article(
                            title=title,
                            url=final_url,
                            image_url=image_url,
                            category="Health",
                            published_date=datetime.now().strftime("%Y-%m-%d"),
                            collected_at=datetime.now().isoformat()
                        )
                        # Fetch content
                        try:
                            article.content = self.scrape_article_body(final_url)
                            time.sleep(random.uniform(0.5, 1.0))
                        except Exception as e:
                            print(f" Failed to fetch content: {e}")
                            article.content = "Failed to load content."
                        all_articles.append(article)
                        source_count += 1
            except Exception as e:
                print(f" Error with source {source_url[:50]}: {e}")
                continue
            # Polite pause between search sources
            time.sleep(random.uniform(0.5, 1.0))

        print(f"Total Health articles collected: {len(all_articles)}")
        return all_articles


if __name__ == "__main__":
    # Test run
    scraper = NewsScraper()
    news = scraper.scrape_category("Society", limit=2)
    print(news)