Files
collect-japan-news/scraper.py
kihong.kim 2d23d8dfc3 Improve Health category with senior-friendly content sources
- Replace single "健康" search with 4 specialized search sources:
  - 健康法 生活習慣 (health tips)
  - 健康 食事 栄養 (nutrition/food)
  - 体操 ストレッチ ウォーキング (exercise)
  - 認知症予防 老化防止 (prevention)
- Add sports/entertainment keyword filter to exclude irrelevant articles
- Collect articles from multiple sources for diverse senior health content

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2025-12-15 17:08:37 +09:00

289 lines
12 KiB
Python

import requests
from bs4 import BeautifulSoup
from typing import List, Optional
from database import Article
from datetime import datetime
import time
import random
class NewsScraper:
    """Scraper for Yahoo! Japan News.

    Regular categories ("Economy", "Society", "Lifestyle") are scraped from
    their category landing pages.  The "Health" category is special-cased:
    articles are gathered from several keyword-search result pages and
    filtered with keyword lists so the results stay senior-friendly.
    """

    BASE_URL = "https://news.yahoo.co.jp"

    # Seconds after which an HTTP request is abandoned.  Previously no
    # timeout was passed, so requests.get() could block forever on a
    # stalled connection.
    REQUEST_TIMEOUT = 10

    # Category name -> landing page URL.  "Health" maps to None because it
    # is assembled from HEALTH_SEARCH_SOURCES instead of a single page.
    CATEGORIES = {
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
        "Health": None,  # Special handling - uses multiple search sources
    }

    # Multiple search sources for senior-friendly health content.
    HEALTH_SEARCH_SOURCES = [
        # General health knowledge
        "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7%E6%B3%95+%E7%94%9F%E6%B4%BB%E7%BF%92%E6%85%A3&ei=utf-8",  # 健康法 生活習慣
        # Healthy food / nutrition
        "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7+%E9%A3%9F%E4%BA%8B+%E6%A0%84%E9%A4%8A&ei=utf-8",  # 健康 食事 栄養
        # Senior-friendly exercise
        "https://news.yahoo.co.jp/search?p=%E4%BD%93%E6%93%8D+%E3%82%B9%E3%83%88%E3%83%AC%E3%83%83%E3%83%81+%E3%82%A6%E3%82%A9%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0&ei=utf-8",  # 体操 ストレッチ ウォーキング
        # Disease prevention
        "https://news.yahoo.co.jp/search?p=%E8%AA%8D%E7%9F%A5%E7%97%87%E4%BA%88%E9%98%B2+%E8%80%81%E5%8C%96%E9%98%B2%E6%AD%A2&ei=utf-8",  # 認知症予防 老化防止
    ]

    # Keywords typical of senior-friendly health content.
    # NOTE(review): currently unused inside this class — presumably intended
    # for positive filtering/ranking; confirm with callers before removing.
    HEALTH_POSITIVE_KEYWORDS = [
        "健康法", "栄養", "食事", "食べ物", "レシピ", "体操", "ストレッチ", "ウォーキング",
        "睡眠", "生活習慣", "予防", "免疫", "認知症予防", "筋トレ", "シニア", "高齢者",
        "長寿", "元気", "若返り", "老化防止", "サプリ", "ビタミン", "血圧", "血糖値"
    ]

    # Keywords used to drop sports/entertainment articles that the health
    # searches also match (e.g. an athlete's "health" story).
    HEALTH_EXCLUDE_KEYWORDS = [
        "選手", "試合", "優勝", "大会", "リーグ", "プロ野球", "サッカー", "相撲",
        "芸能", "俳優", "女優", "タレント", "アイドル", "ドラマ", "映画"
    ]

    def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
        """Collect up to *limit* articles for *category_name*.

        Returns an empty list for unknown categories or when the category
        page cannot be fetched/parsed.
        """
        # Special handling for Health category - use multiple sources.
        if category_name == "Health":
            return self._scrape_health_articles(limit)

        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
            return []

        print(f"Scraping {category_name} from {url}...")
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            articles = []
            # Yahoo Japan News article links typically contain
            # 'news.yahoo.co.jp/articles/' or 'news.yahoo.co.jp/pickup/'.
            candidates = soup.find_all('a')
            print(f"Found {len(candidates)} total links")
            seen_urls = set()
            for link in candidates:
                if len(articles) >= limit:
                    break
                href = link.get('href')
                if not href:
                    continue
                if 'news.yahoo.co.jp/articles/' in href or 'news.yahoo.co.jp/pickup/' in href:
                    # Normalize site-relative links to absolute URLs.
                    if href.startswith('/'):
                        href = self.BASE_URL + href
                    if href in seen_urls:
                        continue
                    title = link.get_text(strip=True)
                    # Very short link texts are navigation labels, not headlines.
                    if len(title) < 5:
                        continue
                    # (The old Health keyword filter that lived here was
                    # unreachable: "Health" returns early above and other
                    # "Health*" names never pass the CATEGORIES lookup.
                    # Health filtering lives in _scrape_health_articles.)
                    img_tag = link.find('img')
                    image_url = img_tag.get('src') if img_tag else None
                    seen_urls.add(href)
                    print(f"Found article: {title}")
                    # Pickup pages are teasers; resolve to the real article URL.
                    final_url = href
                    if "/pickup/" in href:
                        print(f" Resolving pickup URL: {href}")
                        real_url = self.resolve_pickup_url(href)
                        if real_url:
                            print(f" -> Resolved to: {real_url}")
                            final_url = real_url
                    article = Article(
                        title=title,
                        url=final_url,  # Store the final URL
                        image_url=image_url,
                        category=category_name,
                        published_date=datetime.now().strftime("%Y-%m-%d"),
                        collected_at=datetime.now().isoformat()
                    )
                    # Fetch full article text; a failure here only degrades
                    # this one article rather than aborting the whole run.
                    try:
                        print(f" Fetching content for {title[:10]}...")
                        article.content = self.scrape_article_body(final_url)
                        # Polite random delay so we don't hammer the site.
                        time.sleep(random.uniform(0.5, 1.5))
                    except Exception as e:
                        print(f" Failed to fetch content: {e}")
                        article.content = "Failed to load content."
                    articles.append(article)
            print(f"Total articles collected for {category_name}: {len(articles)}")
        except Exception as e:
            print(f"Error scraping {category_name}: {e}")
            import traceback
            traceback.print_exc()
            return []
        return articles

    def resolve_pickup_url(self, pickup_url: str) -> Optional[str]:
        """Follow a /pickup/ teaser page to the full article URL.

        Returns an absolute URL, or None when the page cannot be fetched or
        no article link is found on it.
        """
        try:
            response = requests.get(pickup_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # Preferred: the explicit "続きを読む" ("read more") link.
            link = soup.find('a', string=lambda t: t and '続きを読む' in t)
            if link and link.get('href'):
                return self._absolutize(link.get('href'))
            # Fallback: any news.yahoo.co.jp/articles/ link on the page —
            # pickup pages usually link the full story prominently.
            candidates = soup.find_all('a')
            for l in candidates:
                href = l.get('href')
                if href and 'news.yahoo.co.jp/articles/' in href:
                    return self._absolutize(href)
            return None
        except Exception as e:
            print(f"Error resolving pickup URL {pickup_url}: {e}")
            return None

    def _absolutize(self, href: str) -> str:
        """Prefix a site-relative href with the site origin; pass absolute URLs through."""
        return self.BASE_URL + href if href.startswith('/') else href

    def scrape_article_body(self, url: str) -> str:
        """Fetch *url* and return its article text, paragraphs joined by blank lines.

        Returns "Content not found." when no known body container matches,
        and "" when the request itself fails.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # Selector identified via browser tool:
            # div.sc-iMCRTP.eqMceQ.yjSlinkDirectLink (generic sc-iMCRTP works).
            container = soup.select_one("div.sc-iMCRTP")
            if not container:
                # Fallback if the generated class name changes between deploys.
                container = soup.find("div", class_=lambda x: x and "article_body" in x)
            if container:
                paragraphs = container.find_all('p')
                return "\n\n".join([p.get_text(strip=True) for p in paragraphs])
            return "Content not found."
        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""

    def _scrape_health_articles(self, limit: int = 5) -> List[Article]:
        """Scrape health articles from multiple sources with filtering for senior-friendly content."""
        print("Scraping Health articles from multiple sources...")
        all_articles = []
        seen_urls = set()
        # Spread the quota across sources so results stay diverse.
        articles_per_source = max(2, limit // len(self.HEALTH_SEARCH_SOURCES) + 1)
        for source_url in self.HEALTH_SEARCH_SOURCES:
            if len(all_articles) >= limit:
                break
            print(f" Searching: {source_url[:80]}...")
            try:
                response = requests.get(source_url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, "html.parser")
                candidates = soup.find_all('a')
                source_count = 0
                for link in candidates:
                    if source_count >= articles_per_source or len(all_articles) >= limit:
                        break
                    href = link.get('href')
                    if not href:
                        continue
                    if 'news.yahoo.co.jp/articles/' in href or 'news.yahoo.co.jp/pickup/' in href:
                        if href.startswith('/'):
                            href = self.BASE_URL + href
                        if href in seen_urls:
                            continue
                        title = link.get_text(strip=True)
                        if len(title) < 5:
                            continue
                        # Filter out sports/entertainment news.
                        if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS):
                            print(f" Skipping (sports/entertainment): {title[:30]}...")
                            continue
                        seen_urls.add(href)
                        print(f" Found: {title[:40]}...")
                        img_tag = link.find('img')
                        image_url = img_tag.get('src') if img_tag else None
                        # Resolve pickup teasers to real article URLs.
                        final_url = href
                        if "/pickup/" in href:
                            real_url = self.resolve_pickup_url(href)
                            if real_url:
                                final_url = real_url
                        article = Article(
                            title=title,
                            url=final_url,
                            image_url=image_url,
                            category="Health",
                            published_date=datetime.now().strftime("%Y-%m-%d"),
                            collected_at=datetime.now().isoformat()
                        )
                        try:
                            article.content = self.scrape_article_body(final_url)
                            time.sleep(random.uniform(0.5, 1.0))
                        except Exception as e:
                            print(f" Failed to fetch content: {e}")
                            article.content = "Failed to load content."
                        all_articles.append(article)
                        source_count += 1
            except Exception as e:
                # One bad search source should not kill the other sources.
                print(f" Error with source {source_url[:50]}: {e}")
                continue
            time.sleep(random.uniform(0.5, 1.0))
        print(f"Total Health articles collected: {len(all_articles)}")
        return all_articles
if __name__ == "__main__":
    # Manual smoke test: fetch a couple of Society articles and dump them.
    demo_scraper = NewsScraper()
    fetched = demo_scraper.scrape_category("Society", limit=2)
    print(fetched)