Files
collect-japan-news/scraper.py
kihong.kim 56a6de61ce Initial commit: Japan Senior News Collector
- FastAPI backend with news scraping from Yahoo Japan
- SQLite database for article storage
- Web UI with dark mode, article modal, statistics dashboard
- Docker support for containerized deployment
- API endpoints: /api/today, /api/news, /api/collect-news, /api/dates, /api/download-json
- Auto-collect feature when requesting today's news
- Content filtering for articles without body text
2025-12-15 15:55:37 +09:00

168 lines
6.6 KiB
Python

import requests
from bs4 import BeautifulSoup
from typing import List, Optional
from database import Article
from datetime import datetime
import time
import random
class NewsScraper:
    """Scrapes senior-oriented news articles from Yahoo Japan News.

    For each configured category the scraper scans the category landing page
    for article links, resolves ``/pickup/`` redirect pages to their final
    article URL, and fetches the full body text of each article.
    """

    BASE_URL = "https://news.yahoo.co.jp"

    # Category name -> landing page URL to scan for article links.
    CATEGORIES = {
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
        "Health": "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7&ei=utf-8",  # Search for 'Health' (健康)
    }

    # Keywords identifying health-related articles.
    # FIX: the original list contained an empty string "", which would match
    # every title under substring filtering — removed.
    HEALTH_KEYWORDS = [
        "健康", "医療", "病気", "病院", "医師", "ワクチン", "感染", "介護",
        "認知症", "老化", "ダイエット", "運動", "睡眠", "ストレス", "メンタル",
    ]

    # Seconds before an HTTP request is abandoned. FIX: the original calls had
    # no timeout, so a single stalled server could hang the collector forever.
    REQUEST_TIMEOUT = 10

    def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
        """Collect up to ``limit`` articles from one configured category.

        Args:
            category_name: A key of ``CATEGORIES``; unknown names yield [].
            limit: Maximum number of articles to collect.

        Returns:
            A list of ``Article`` objects with ``content`` populated (or a
            placeholder string when the body fetch fails).  Returns an empty
            list on any scraping error.
        """
        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
            return []
        print(f"Scraping {category_name} from {url}...")
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            articles = []
            # Yahoo Japan News article links contain either
            # 'news.yahoo.co.jp/articles/' or 'news.yahoo.co.jp/pickup/'.
            candidates = soup.find_all('a')
            print(f"Found {len(candidates)} total links")
            seen_urls = set()
            for link in candidates:
                if len(articles) >= limit:
                    break
                href = link.get('href')
                if not href:
                    continue
                if 'news.yahoo.co.jp/articles/' not in href and 'news.yahoo.co.jp/pickup/' not in href:
                    continue
                # Normalize relative links to absolute URLs.
                if href.startswith('/'):
                    href = self.BASE_URL + href
                if href in seen_urls:
                    continue
                title = link.get_text(strip=True)
                # Very short link text is navigation chrome, not a headline.
                if len(title) < 5:
                    continue
                img_tag = link.find('img')
                image_url = img_tag.get('src') if img_tag else None
                seen_urls.add(href)
                print(f"Found article: {title}")
                # Pickup pages are redirect stubs; resolve to the real article.
                final_url = href
                if "/pickup/" in href:
                    print(f"  Resolving pickup URL: {href}")
                    real_url = self.resolve_pickup_url(href)
                    if real_url:
                        print(f"  -> Resolved to: {real_url}")
                        final_url = real_url
                article = Article(
                    title=title,
                    url=final_url,  # Store the resolved URL
                    image_url=image_url,
                    category=category_name,
                    published_date=datetime.now().strftime("%Y-%m-%d"),
                    collected_at=datetime.now().isoformat(),
                )
                # Fetch the full body text for the article.
                try:
                    print(f"  Fetching content for {title[:10]}...")
                    article.content = self.scrape_article_body(final_url)
                    # Polite random delay between article fetches.
                    time.sleep(random.uniform(0.5, 1.5))
                except Exception as e:
                    print(f"  Failed to fetch content: {e}")
                    article.content = "Failed to load content."
                articles.append(article)
            print(f"Total articles collected for {category_name}: {len(articles)}")
        except Exception as e:
            print(f"Error scraping {category_name}: {e}")
            import traceback
            traceback.print_exc()
            return []
        return articles

    def resolve_pickup_url(self, pickup_url: str) -> Optional[str]:
        """Resolve a Yahoo ``/pickup/`` stub page to the real article URL.

        Returns the absolute article URL, or None when resolution fails.
        """
        try:
            response = requests.get(pickup_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # Preferred: the "続きを読む" ("read more") link points at the article.
            link = soup.find('a', string=lambda t: t and '続きを読む' in t)
            if link and link.get('href'):
                href = link.get('href')
                # FIX: the "read more" href can be site-relative; callers
                # expect an absolute URL, so normalize it here.
                if href.startswith('/'):
                    href = self.BASE_URL + href
                return href
            # Fallback: any /articles/ link on the stub page.
            for candidate in soup.find_all('a'):
                href = candidate.get('href')
                if href and 'news.yahoo.co.jp/articles/' in href:
                    return href
            return None
        except Exception as e:
            print(f"Error resolving pickup URL {pickup_url}: {e}")
            return None

    def scrape_article_body(self, url: str) -> str:
        """Fetch an article page and return its body text.

        Paragraphs are joined with blank lines.  Returns
        "Content not found." when no body container is located, and ""
        on a request/parse error (downstream content filtering treats an
        empty body as "no content").
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
            # Selector identified via browser inspection; the sc-* class is a
            # generated styled-components name and may change over time.
            container = soup.select_one("div.sc-iMCRTP")
            if not container:
                # Fallback if the generated class name changes.
                container = soup.find("div", class_=lambda x: x and "article_body" in x)
            if container:
                paragraphs = container.find_all('p')
                return "\n\n".join(p.get_text(strip=True) for p in paragraphs)
            return "Content not found."
        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""
if __name__ == "__main__":
    # Manual smoke test: collect a couple of Society articles and dump them.
    collector = NewsScraper()
    sample = collector.scrape_category("Society", limit=2)
    print(sample)