Initial commit: Japan Senior News Collector
- FastAPI backend with news scraping from Yahoo Japan - SQLite database for article storage - Web UI with dark mode, article modal, statistics dashboard - Docker support for containerized deployment - API endpoints: /api/today, /api/news, /api/collect-news, /api/dates, /api/download-json - Auto-collect feature when requesting today's news - Content filtering for articles without body text
This commit is contained in:
167
scraper.py
Normal file
167
scraper.py
Normal file
@@ -0,0 +1,167 @@
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import List, Optional
|
||||
from database import Article
|
||||
from datetime import datetime
|
||||
import time
|
||||
import random
|
||||
|
||||
class NewsScraper:
    """Scrapes article headlines and bodies from Yahoo Japan News.

    Category pages are scanned for article links, each link is resolved to
    its final article URL (``/pickup/`` pages are teasers that point at the
    real story), and the full body text is fetched for every article kept.
    """

    BASE_URL = "https://news.yahoo.co.jp"

    # Seconds before an HTTP request is abandoned; without a timeout a
    # stalled connection would hang the whole collection run.
    REQUEST_TIMEOUT = 10

    CATEGORIES = {
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
        "Health": "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7&ei=utf-8"  # Search for 'Health'
    }

    # NOTE(review): currently unused by the scraping logic — presumably
    # intended for filtering Health-category titles; confirm before relying
    # on it or remove it.
    HEALTH_KEYWORDS = ["健康", "医療", "病気", "病院", "医師", "薬", "ワクチン", "感染", "介護", "認知症", "老化", "ダイエット", "運動", "睡眠", "ストレス", "メンタル"]

    def _get_soup(self, url: str) -> "BeautifulSoup":
        """Fetch *url* and return the parsed HTML document.

        Raises:
            requests.RequestException: on network failure, timeout, or a
                non-2xx response (via ``raise_for_status``).
        """
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        response.raise_for_status()
        return BeautifulSoup(response.content, "html.parser")

    def _absolutize(self, href: str) -> str:
        """Prefix site-relative hrefs with BASE_URL so they are fetchable."""
        return self.BASE_URL + href if href.startswith('/') else href

    def scrape_category(self, category_name: str, limit: int = 5) -> "List[Article]":
        """Collect up to *limit* articles (with bodies) for one category.

        Args:
            category_name: Key into ``CATEGORIES`` ("Economy", "Society", ...).
            limit: Maximum number of articles to return.

        Returns:
            A list of Article objects; empty when the category is unknown
            or the category page cannot be scraped at all.
        """
        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
            return []

        print(f"Scraping {category_name} from {url}...")
        try:
            soup = self._get_soup(url)

            articles = []

            # Yahoo Japan News article links contain
            # 'news.yahoo.co.jp/articles/' or 'news.yahoo.co.jp/pickup/'.
            candidates = soup.find_all('a')
            print(f"Found {len(candidates)} total links")

            seen_urls = set()

            for link in candidates:
                if len(articles) >= limit:
                    break

                href = link.get('href')
                if not href:
                    continue
                if ('news.yahoo.co.jp/articles/' not in href
                        and 'news.yahoo.co.jp/pickup/' not in href):
                    continue

                # Normalize site-relative links to absolute URLs.
                href = self._absolutize(href)
                if href in seen_urls:
                    continue

                # Very short link texts are nav/icon links, not headlines.
                title = link.get_text(strip=True)
                if len(title) < 5:
                    continue

                # Thumbnail, if the link wraps an <img>.
                img_tag = link.find('img')
                image_url = img_tag.get('src') if img_tag else None

                seen_urls.add(href)
                print(f"Found article: {title}")

                # Pickup pages are teasers — resolve to the real article URL.
                final_url = href
                if "/pickup/" in href:
                    print(f" Resolving pickup URL: {href}")
                    real_url = self.resolve_pickup_url(href)
                    if real_url:
                        print(f" -> Resolved to: {real_url}")
                        final_url = real_url

                article = Article(
                    title=title,
                    url=final_url,  # Store the final URL
                    image_url=image_url,
                    category=category_name,
                    published_date=datetime.now().strftime("%Y-%m-%d"),
                    collected_at=datetime.now().isoformat()
                )

                # Fetch full content; a failure keeps the article with a
                # placeholder body instead of dropping it.
                try:
                    print(f" Fetching content for {title[:10]}...")
                    article.content = self.scrape_article_body(final_url)
                    # Random pause so we do not hammer the site.
                    time.sleep(random.uniform(0.5, 1.5))
                except Exception as e:
                    print(f" Failed to fetch content: {e}")
                    article.content = "Failed to load content."

                articles.append(article)

            print(f"Total articles collected for {category_name}: {len(articles)}")

        except Exception as e:
            print(f"Error scraping {category_name}: {e}")
            import traceback
            traceback.print_exc()
            return []

        return articles

    def resolve_pickup_url(self, pickup_url: str) -> Optional[str]:
        """Resolve a ``/pickup/`` teaser page to the underlying article URL.

        Returns:
            The absolute article URL, or None when it cannot be found.
        """
        try:
            soup = self._get_soup(pickup_url)

            # Look for the "続きを読む" ("read more") link first.
            link = soup.find('a', string=lambda t: t and '続きを読む' in t)
            if link and link.get('href'):
                # Absolutize: a relative href here would be unfetchable later.
                return self._absolutize(link.get('href'))

            # Fallback: pickup pages usually carry a clear link to the full
            # story somewhere in the markup — take the first article link.
            for candidate in soup.find_all('a'):
                href = candidate.get('href')
                if href and 'news.yahoo.co.jp/articles/' in href:
                    return self._absolutize(href)

            return None
        except Exception as e:
            print(f"Error resolving pickup URL {pickup_url}: {e}")
            return None

    def scrape_article_body(self, url: str) -> str:
        """Fetch an article page and return its body text.

        Paragraphs are joined with blank lines. Returns the literal string
        "Content not found." when no body container is located, and "" on
        any request/parse error.
        """
        try:
            soup = self._get_soup(url)

            # Selector identified via browser tool:
            # div.sc-iMCRTP.eqMceQ.yjSlinkDirectLink (generic sc-iMCRTP works).
            container = soup.select_one("div.sc-iMCRTP")
            if not container:
                # Fallback if the generated class name changes.
                container = soup.find("div", class_=lambda x: x and "article_body" in x)

            if container:
                paragraphs = container.find_all('p')
                return "\n\n".join(p.get_text(strip=True) for p in paragraphs)

            return "Content not found."

        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""
if __name__ == "__main__":
    # Manual smoke test: pull two Society articles and dump them to stdout.
    demo_scraper = NewsScraper()
    collected = demo_scraper.scrape_category("Society", limit=2)
    print(collected)
|
||||
Reference in New Issue
Block a user