From 56a6de61ce000bfe3b8c83eca01ed76d114077ee Mon Sep 17 00:00:00 2001 From: "kihong.kim" Date: Mon, 15 Dec 2025 15:55:37 +0900 Subject: [PATCH] Initial commit: Japan Senior News Collector - FastAPI backend with news scraping from Yahoo Japan - SQLite database for article storage - Web UI with dark mode, article modal, statistics dashboard - Docker support for containerized deployment - API endpoints: /api/today, /api/news, /api/collect-news, /api/dates, /api/download-json - Auto-collect feature when requesting today news - Content filtering for articles without body text --- .dockerignore | 7 + .gitignore | 24 ++ DEPLOY.md | 52 ++++ Dockerfile | 15 + README.md | 181 ++++++++++++ database.py | 103 +++++++ docker-compose.yml | 11 + main.py | 126 +++++++++ requirements.txt | 5 + scraper.py | 167 ++++++++++++ static/index.html | 667 +++++++++++++++++++++++++++++++++++++++++++++ 11 files changed, 1358 insertions(+) create mode 100644 .dockerignore create mode 100644 .gitignore create mode 100644 DEPLOY.md create mode 100644 Dockerfile create mode 100644 README.md create mode 100644 database.py create mode 100644 docker-compose.yml create mode 100644 main.py create mode 100644 requirements.txt create mode 100644 scraper.py create mode 100644 static/index.html diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 0000000..5c825ff --- /dev/null +++ b/.dockerignore @@ -0,0 +1,7 @@ +venv/ +__pycache__/ +*.pyc +*.pyo +.git/ +.gitignore +*.db diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..45c908b --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +venv/ +.env + +# Database +*.db + +# IDE +.vscode/ +.idea/ + +# Debug/Test files +debug_output.txt +test_*.py + +# Generated files +*.json + +# OS +.DS_Store diff --git a/DEPLOY.md b/DEPLOY.md new file mode 100644 index 0000000..fe91dff --- /dev/null +++ b/DEPLOY.md @@ -0,0 +1,52 @@ +# Deployment Guide for Japan News Collector + +This guide 
explains how to deploy the application using Docker on your home server or any machine with Docker installed. + +## Prerequisites + +- [Docker](https://docs.docker.com/get-docker/) installed. +- [Docker Compose](https://docs.docker.com/compose/install/) installed (usually included with Docker Desktop/Docker Engine). + +## Files Overview + +- **Dockerfile**: Defines the environment (Python 3.9) and dependencies. +- **docker-compose.yml**: Orchestrates the container, maps ports (8000), and persists data (`news.db`). + +## Deployment Steps + +1. **Transfer Files**: Copy the entire project folder to your server. +2. **Navigate to Directory**: + ```bash + cd japan-news + ``` +3. **Start the Service**: + Run the following command to build and start the container in the background: + ```bash + docker-compose up -d --build + ``` + +## Managing the Service + +- **Check Logs**: + ```bash + docker-compose logs -f + ``` +- **Stop the Service**: + ```bash + docker-compose down + ``` +- **Restart**: + ```bash + docker-compose restart + ``` + +## Data Persistence + +The database file `news.db` is mapped to the container. +- Even if you stop or remove the container, your data in `news.db` on the host machine will remain safe. +- **Backup**: Simply backup the `news.db` file. + +## Accessing the Application + +Open your browser and navigate to: +`http://localhost:8000` (or your server's IP address: `http://:8000`) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..2c80f19 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,15 @@ +FROM python:3.9-slim + +WORKDIR /app + +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . 
+ +# Create static directory if it doesn't exist +RUN mkdir -p static + +EXPOSE 8000 + +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/README.md b/README.md new file mode 100644 index 0000000..048b342 --- /dev/null +++ b/README.md @@ -0,0 +1,181 @@ +# Japan Senior News Collector + +일본 Yahoo Japan에서 시니어 관련 뉴스를 자동으로 수집하고 관리하는 웹 애플리케이션입니다. + +## 주요 기능 + +### 뉴스 수집 +- Yahoo Japan에서 4개 카테고리 뉴스 자동 수집 + - 건강 (Health) + - 생활 (Lifestyle) + - 경제 (Economy) + - 사회 (Society) +- 각 카테고리별 최대 5개 기사 수집 +- 기사 본문 콘텐츠 자동 추출 +- 콘텐츠가 없는 기사 자동 필터링 + +### 웹 UI +- 카테고리별 뉴스 카드 뷰 +- 기사 클릭 시 상세 모달 표시 +- 날짜별 히스토리 조회 +- 다크모드 지원 +- 통계 대시보드 (카테고리별 기사 수) +- JSON 다운로드 기능 + +## 기술 스택 + +- **Backend**: FastAPI, Python 3.9 +- **Database**: SQLite +- **Frontend**: HTML, Tailwind CSS, JavaScript +- **Scraping**: BeautifulSoup4, Requests +- **Container**: Docker + +## 설치 및 실행 + +### Docker 실행 (권장) + +```bash +# 이미지 빌드 +docker build -t japan-news . + +# 컨테이너 실행 +docker run -d --name japan-news -p 8001:8000 japan-news +``` + +### 로컬 실행 + +```bash +# 가상환경 생성 및 활성화 +python -m venv venv +source venv/bin/activate # Windows: venv\Scripts\activate + +# 의존성 설치 +pip install -r requirements.txt + +# 서버 실행 +uvicorn main:app --reload --port 8000 +``` + +## API 엔드포인트 + +### GET / +웹 UI 페이지 반환 + +### GET /api/today +오늘의 뉴스 조회. 오늘 수집된 기사가 없으면 자동으로 수집 후 반환. + +**응답 예시:** +```json +{ + "date": "2025-12-15", + "articles": { + "Economy": [...], + "Society": [...], + "Lifestyle": [...], + "Health": [...] 
+ }, + "total_count": 19 +} +``` + +### GET /api/news +뉴스 목록 조회 + +**Query Parameters:** +- `date` (optional): 조회할 날짜 (YYYY-MM-DD 형식) + +### POST /api/collect-news +뉴스 수집 실행 + +**응답 예시:** +```json +{ + "status": "success", + "collected_count": 20, + "details": { + "Economy": 5, + "Society": 5, + "Lifestyle": 5, + "Health": 5 + } +} +``` + +### GET /api/dates +수집된 날짜 목록 조회 + +**응답 예시:** +```json +{ + "dates": ["2025-12-15", "2025-12-14", "2025-12-13"] +} +``` + +### GET /api/download-json +뉴스 데이터 JSON 파일 다운로드 + +**Query Parameters:** +- `date` (optional): 다운로드할 날짜 (YYYY-MM-DD 형식) + +## 프로젝트 구조 + +``` +japan-news/ +├── main.py # FastAPI 애플리케이션 +├── database.py # SQLite 데이터베이스 관리 +├── scraper.py # Yahoo Japan 뉴스 스크래퍼 +├── requirements.txt # Python 의존성 +├── Dockerfile # Docker 설정 +├── .dockerignore # Docker 빌드 제외 파일 +├── static/ +│ └── index.html # 웹 UI +└── README.md +``` + +## 데이터베이스 스키마 + +### articles 테이블 + +| 컬럼 | 타입 | 설명 | +|------|------|------| +| id | INTEGER | Primary Key | +| title | TEXT | 기사 제목 | +| url | TEXT | 기사 URL (UNIQUE) | +| image_url | TEXT | 썸네일 이미지 URL | +| published_date | TEXT | 발행일 | +| category | TEXT | 카테고리 | +| source | TEXT | 출처 | +| collected_at | TEXT | 수집 시간 (ISO format) | +| content | TEXT | 기사 본문 | + +## 외부 연동 + +`/api/today` 엔드포인트를 사용하면 외부 시스템에서 오늘의 뉴스를 쉽게 조회할 수 있습니다. + +```bash +# 오늘의 뉴스 조회 (없으면 자동 수집) +curl http://localhost:8001/api/today +``` + +## Docker 관리 명령어 + +```bash +# 로그 확인 +docker logs -f japan-news + +# 컨테이너 중지 +docker stop japan-news + +# 컨테이너 시작 +docker start japan-news + +# 컨테이너 삭제 +docker rm -f japan-news + +# 이미지 재빌드 후 실행 +docker rm -f japan-news && docker build -t japan-news . 
import sqlite3
from datetime import datetime, date
from typing import List, Optional
from pydantic import BaseModel, Field

# SQLite database file, created in the working directory on first use.
DB_NAME = "news.db"


class Article(BaseModel):
    """One scraped news article, mirroring a row of the `articles` table."""
    title: str
    url: str
    image_url: Optional[str] = None
    published_date: Optional[str] = None
    category: str
    source: str = "Yahoo Japan"
    # BUG FIX: the original default `datetime.now().isoformat()` was evaluated
    # once at import time, so every Article created without an explicit
    # collected_at carried the same stale timestamp for the life of the
    # process. default_factory re-evaluates per instance.
    collected_at: str = Field(default_factory=lambda: datetime.now().isoformat())
    content: Optional[str] = None


def init_db():
    """Create the `articles` table if it does not exist yet (idempotent)."""
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()
    c.execute('''
        CREATE TABLE IF NOT EXISTS articles (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            title TEXT NOT NULL,
            url TEXT UNIQUE NOT NULL,
            image_url TEXT,
            published_date TEXT,
            category TEXT,
            source TEXT,
            collected_at TEXT,
            content TEXT
        )
    ''')
    conn.commit()
    conn.close()


def save_article(article: Article):
    """Insert *article*; on a duplicate URL refresh its content/image/date.

    Errors are logged and swallowed deliberately so one bad article cannot
    abort a whole collection run.
    """
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()
    try:
        # Schema migration: databases created before the `content` column
        # existed are upgraded in place.
        cursor = c.execute("PRAGMA table_info(articles)")
        columns = [info[1] for info in cursor.fetchall()]
        if 'content' not in columns:
            c.execute("ALTER TABLE articles ADD COLUMN content TEXT")
            conn.commit()

        c.execute('''
            INSERT INTO articles (title, url, image_url, published_date, category, source, collected_at, content)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            ON CONFLICT(url) DO UPDATE SET
                content = excluded.content,
                image_url = excluded.image_url,
                published_date = excluded.published_date
        ''', (article.title, article.url, article.image_url, article.published_date,
              article.category, article.source, article.collected_at, article.content))
        conn.commit()
    except Exception as e:
        print(f"Error saving article: {e}")
    finally:
        conn.close()


def get_articles(category: Optional[str] = None, collection_date: Optional[str] = None, limit: int = 5) -> List[dict]:
    """Return stored articles as dicts, newest first.

    Articles without a usable body are filtered out. *collection_date* is a
    'YYYY-MM-DD' string compared against the date part of `collected_at`
    (ISO format); when omitted, the latest articles are returned regardless
    of the day they were collected.
    """
    conn = sqlite3.connect(DB_NAME)
    conn.row_factory = sqlite3.Row
    c = conn.cursor()

    # Exclude articles whose body could not be scraped (see scraper markers).
    query = "SELECT * FROM articles WHERE content IS NOT NULL AND content != '' AND content != 'Content not found.'"
    params: list = []

    if category:
        query += " AND category = ?"
        params.append(category)

    if collection_date:
        query += " AND date(collected_at) = ?"
        params.append(collection_date)

    query += " ORDER BY collected_at DESC LIMIT ?"
    params.append(limit)

    c.execute(query, tuple(params))
    rows = c.fetchall()
    conn.close()

    return [dict(row) for row in rows]


def get_available_dates() -> List[str]:
    """Return the distinct collection dates ('YYYY-MM-DD'), newest first."""
    conn = sqlite3.connect(DB_NAME)
    c = conn.cursor()
    c.execute("SELECT DISTINCT date(collected_at) as date_val FROM articles ORDER BY date_val DESC")
    rows = c.fetchall()
    conn.close()
    return [row[0] for row in rows if row[0]]
from fastapi import FastAPI, HTTPException
from fastapi.responses import FileResponse, Response
from fastapi.staticfiles import StaticFiles
from pydantic import BaseModel
from typing import List, Dict, Optional
from datetime import datetime
import json
import os

import database
from scraper import NewsScraper

# Categories scraped and served by every endpoint, in a fixed order.
CATEGORIES = ["Economy", "Society", "Lifestyle", "Health"]

app = FastAPI()

# Initialize DB schema on startup (idempotent).
database.init_db()

# Serve the web UI assets.
os.makedirs("static", exist_ok=True)
app.mount("/static", StaticFiles(directory="static"), name="static")


def _collect_and_store(limit: int = 5) -> Dict[str, int]:
    """Scrape every category, persist the articles, return per-category counts.

    NOTE(review): scraping is blocking network I/O; calling this from an
    async handler stalls the event loop while it runs — consider
    run_in_executor / a background task if responsiveness matters.
    """
    scraper = NewsScraper()
    counts: Dict[str, int] = {}
    for cat in CATEGORIES:
        articles = scraper.scrape_category(cat, limit=limit)
        for article in articles:
            database.save_article(article)
        counts[cat] = len(articles)
    return counts


@app.get("/")
async def read_root():
    """Serve the single-page web UI."""
    return FileResponse('static/index.html')


@app.post("/api/collect-news")
async def collect_news():
    """Scrape all categories now and store the results."""
    results = _collect_and_store(limit=5)
    return {
        "status": "success",
        "collected_count": sum(results.values()),
        "details": results,
    }


@app.get("/api/dates")
async def get_dates():
    """List the dates ('YYYY-MM-DD') for which articles exist."""
    return {"dates": database.get_available_dates()}


@app.get("/api/download-json")
async def download_json(date: Optional[str] = None):
    """Return stored articles as a downloadable JSON attachment.

    *date* is an optional 'YYYY-MM-DD'; when omitted the latest articles are
    returned and today's date is used in the file name.
    """
    data = {
        cat: database.get_articles(category=cat, collection_date=date, limit=5)
        for cat in CATEGORIES
    }

    file_date = date if date else datetime.now().strftime("%Y-%m-%d")
    filename = f"japan-news-{file_date}.json"
    json_content = json.dumps(data, indent=2, ensure_ascii=False)

    # BUG FIX: the header previously contained a literal placeholder instead
    # of interpolating the computed `filename`, so every download was
    # misnamed. Use the computed name.
    return Response(
        content=json_content,
        media_type="application/json",
        headers={"Content-Disposition": f"attachment; filename={filename}"},
    )


@app.get("/api/news")
async def get_news(date: Optional[str] = None):
    """Return articles grouped by category, optionally for one 'YYYY-MM-DD'."""
    return {
        cat: database.get_articles(category=cat, collection_date=date, limit=5)
        for cat in CATEGORIES
    }


@app.get("/api/today")
async def get_today_news():
    """
    Get today's news. If no articles exist for today, collect them first.
    Returns JSON with all categories.
    """
    today = datetime.now().strftime("%Y-%m-%d")

    # Collect on demand if nothing has been stored for today yet.
    has_today_articles = any(
        database.get_articles(category=cat, collection_date=today, limit=1)
        for cat in CATEGORIES
    )
    if not has_today_articles:
        _collect_and_store(limit=5)

    response_data = {"date": today, "articles": {}}
    total_count = 0
    for cat in CATEGORIES:
        articles = database.get_articles(category=cat, collection_date=today, limit=5)
        response_data["articles"][cat] = articles
        total_count += len(articles)

    response_data["total_count"] = total_count
    return response_data
import requests
from bs4 import BeautifulSoup
from typing import List, Optional
from database import Article
from datetime import datetime
import time
import random


class NewsScraper:
    """Scrapes senior-oriented news from Yahoo Japan category/search pages."""

    BASE_URL = "https://news.yahoo.co.jp"

    # BUG FIX: every requests.get() now passes this timeout; the original
    # calls had none, so a single stalled connection could hang the whole
    # collection run indefinitely.
    REQUEST_TIMEOUT = 10

    CATEGORIES = {
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
        "Health": "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7&ei=utf-8"  # Search for 'Health'
    }

    # NOTE(review): currently unused by scrape_category — presumably intended
    # for keyword filtering of Health results; confirm before removing.
    HEALTH_KEYWORDS = ["健康", "医療", "病気", "病院", "医師", "薬", "ワクチン", "感染", "介護", "認知症", "老化", "ダイエット", "運動", "睡眠", "ストレス", "メンタル"]

    def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
        """Scrape up to *limit* articles for *category_name*.

        Returns an empty list for unknown categories or on any page-level
        scraping error. Each returned Article has its full body fetched
        (or a failure marker in `content`).
        """
        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
            return []

        print(f"Scraping {category_name} from {url}...")
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            articles = []

            # Yahoo Japan article links contain 'news.yahoo.co.jp/articles/'
            # or 'news.yahoo.co.jp/pickup/'.
            candidates = soup.find_all('a')
            print(f"Found {len(candidates)} total links")

            seen_urls = set()

            for link in candidates:
                if len(articles) >= limit:
                    break

                href = link.get('href')
                if not href:
                    continue

                if 'news.yahoo.co.jp/articles/' in href or 'news.yahoo.co.jp/pickup/' in href:
                    # Absolutize relative links.
                    if href.startswith('/'):
                        href = self.BASE_URL + href

                    if href in seen_urls:
                        continue

                    # Skip navigation/icon links with no real headline text.
                    title = link.get_text(strip=True)
                    if len(title) < 5:
                        continue

                    img_tag = link.find('img')
                    image_url = img_tag.get('src') if img_tag else None

                    seen_urls.add(href)

                    print(f"Found article: {title}")

                    # Pickup pages are teasers; resolve to the real article URL.
                    final_url = href
                    if "/pickup/" in href:
                        print(f"  Resolving pickup URL: {href}")
                        real_url = self.resolve_pickup_url(href)
                        if real_url:
                            print(f"  -> Resolved to: {real_url}")
                            final_url = real_url

                    article = Article(
                        title=title,
                        url=final_url,  # Store the final URL
                        image_url=image_url,
                        category=category_name,
                        published_date=datetime.now().strftime("%Y-%m-%d"),
                        collected_at=datetime.now().isoformat()
                    )

                    # Fetch the full body; sleep between requests to be polite.
                    try:
                        print(f"  Fetching content for {title[:10]}...")
                        article.content = self.scrape_article_body(final_url)
                        time.sleep(random.uniform(0.5, 1.5))
                    except Exception as e:
                        print(f"  Failed to fetch content: {e}")
                        article.content = "Failed to load content."

                    articles.append(article)

            print(f"Total articles collected for {category_name}: {len(articles)}")

        except Exception as e:
            print(f"Error scraping {category_name}: {e}")
            import traceback
            traceback.print_exc()
            return []

        return articles

    def resolve_pickup_url(self, pickup_url: str) -> Optional[str]:
        """Resolve a /pickup/ teaser page to the underlying article URL.

        Returns None when no article link can be found or the request fails.
        """
        try:
            response = requests.get(pickup_url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            def _absolutize(href: str) -> str:
                # BUG FIX: pickup pages may link relatively; the original
                # returned such hrefs verbatim, yielding unusable URLs.
                return self.BASE_URL + href if href.startswith('/') else href

            # Preferred: the explicit "続きを読む" ("read more") link.
            link = soup.find('a', string=lambda t: t and '続きを読む' in t)
            if link and link.get('href'):
                return _absolutize(link.get('href'))

            # Fallback: first full-article link anywhere on the page.
            for l in soup.find_all('a'):
                href = l.get('href')
                if href and 'news.yahoo.co.jp/articles/' in href:
                    return _absolutize(href)

            return None
        except Exception as e:
            print(f"Error resolving pickup URL {pickup_url}: {e}")
            return None

    def scrape_article_body(self, url: str) -> str:
        """Fetch *url* and return the concatenated paragraph text of the body.

        Returns 'Content not found.' when no known container matches and ''
        on a network/parsing error — both markers are filtered out by
        database.get_articles.
        """
        try:
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            # Primary selector observed on Yahoo Japan article pages; the
            # hashed class name is fragile, hence the generic fallback below.
            container = soup.select_one("div.sc-iMCRTP")
            if not container:
                container = soup.find("div", class_=lambda x: x and "article_body" in x)

            if container:
                paragraphs = container.find_all('p')
                return "\n\n".join(p.get_text(strip=True) for p in paragraphs)

            return "Content not found."

        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""


if __name__ == "__main__":
    # Smoke test: scrape two Society articles and print them.
    scraper = NewsScraper()
    news = scraper.scrape_category("Society", limit=2)
    print(news)
+ + +
+
+

Senior Daily News Collector

+

Viewing: Latest

+
+
+ + +
+
+ + +
+ + + + + +
+
+ +
+
+
+

Total Articles

+

0

+
+
+ + + +
+
+
+ + +
+
+
+

Health

+

0

+
+
+ 🏥 +
+
+
+ +
+
+
+

Lifestyle

+

0

+
+
+ 🏡 +
+
+
+ +
+
+
+

Economy

+

0

+
+
+ 💼 +
+
+
+ +
+
+
+

Society

+

0

+
+
+ 📢 +
+
+
+
+
+ + +
+ +
+
+

+ 🏥 健康 (Health) +

+
+
+

No news collected yet.

+
+
+ + +
+
+

+ 🏡 生活 (Lifestyle) +

+
+
+

No news collected yet.

+
+
+ + +
+
+

+ 💼 経済 (Economy) +

+
+
+

No news collected yet.

+
+
+ + +
+
+

+ 📢 社会 (Society) +

+
+
+

No news collected yet.

+
+
+
+
+
+ + + + + + + + + + +