Add Gitea Actions workflow for auto-deployment

- Deploy to /home/bini/project/collect-japan-news on push to main
- Pull code, rebuild Docker container, and restart
- Include health check after deployment

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
Improve Health category with senior-friendly content sources
2025-12-15 17:12:31 +09:00 · 2025-12-15 17:08:37 +09:00
2 changed files with 162 additions and 8 deletions
--- a/.gitea/workflows/deploy.yml
+++ b/.gitea/workflows/deploy.yml
@@ -0,0 +1,33 @@
 # Deploys the application on every push to main: updates the checkout at
 # DEPLOY_PATH, rebuilds the Docker image, recreates the container, and then
 # verifies that the service answers over HTTP.
 name: Deploy to Home Server
 on:
  push:
    branches:
      - main
 env:
  # Working copy on the target machine that holds the compose project.
  DEPLOY_PATH: /home/bini/project/collect-japan-news
 jobs:
  deploy:
    # NOTE(review): every step below touches a host path and the host Docker
    # daemon; confirm the runner registered under this label can reach both.
    runs-on: ubuntu-latest
    steps:
      - name: Pull latest code
        run: |
          cd ${{ env.DEPLOY_PATH }}
          git pull origin main
      - name: Rebuild and restart container
        run: |
          cd ${{ env.DEPLOY_PATH }}
          # Build BEFORE touching the running container: a failed build then
          # leaves the current version serving instead of taking the site down.
          docker compose build --no-cache
          # `up -d` recreates containers whose image changed, so a separate
          # `down` (and the downtime during the build) is not needed.
          docker compose up -d
      - name: Cleanup old images
        run: docker image prune -f
      - name: Health check
        run: |
          # Poll instead of a fixed sleep: retry while the port is still
          # refusing connections or the app returns a transient error.
          curl -f --retry 10 --retry-delay 3 --retry-connrefused http://localhost:8001
--- a/scraper.py
+++ b/scraper.py
@@ -13,12 +13,39 @@ class NewsScraper:
        "Economy": "https://news.yahoo.co.jp/categories/business",
        "Society": "https://news.yahoo.co.jp/categories/domestic",
        "Lifestyle": "https://news.yahoo.co.jp/categories/life",
-        "Health": "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7&ei=utf-8"  # Search for 'Health'
+        "Health": None,  # Special handling - uses multiple search sources
    }
-    HEALTH_KEYWORDS = ["健康", "医療", "病気", "病院", "医師", "薬", "ワクチン", "感染", "介護", "認知症", "老化", "ダイエット", "運動", "睡眠", "ストレス", "メンタル"]
+    # Multiple search sources for senior-friendly health content
    # Yahoo! News search-result URLs for the Health category.
    # _scrape_health_articles walks these in list order.
    HEALTH_SEARCH_SOURCES = [
        # General health knowledge
        "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7%E6%B3%95+%E7%94%9F%E6%B4%BB%E7%BF%92%E6%85%A3&ei=utf-8",  # query: 健康法 生活習慣 (health methods, lifestyle habits)
        # Good food / nutrition
        "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7+%E9%A3%9F%E4%BA%8B+%E6%A0%84%E9%A4%8A&ei=utf-8",  # query: 健康 食事 栄養 (health, meals, nutrition)
        # Exercise / calisthenics for seniors
        "https://news.yahoo.co.jp/search?p=%E4%BD%93%E6%93%8D+%E3%82%B9%E3%83%88%E3%83%AC%E3%83%83%E3%83%81+%E3%82%A6%E3%82%A9%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0&ei=utf-8",  # query: 体操 ストレッチ ウォーキング (calisthenics, stretching, walking)
        # Disease prevention
        "https://news.yahoo.co.jp/search?p=%E8%AA%8D%E7%9F%A5%E7%97%87%E4%BA%88%E9%98%B2+%E8%80%81%E5%8C%96%E9%98%B2%E6%AD%A2&ei=utf-8",  # query: 認知症予防 老化防止 (dementia prevention, anti-aging)
    ]
    # Keywords for senior-friendly health content
    # NOTE(review): this list is not referenced by any code visible in this
    # change; confirm whether a positive-match filter is still planned.
    HEALTH_POSITIVE_KEYWORDS = [
        "健康法", "栄養", "食事", "食べ物", "レシピ", "体操", "ストレッチ", "ウォーキング",
        "睡眠", "生活習慣", "予防", "免疫", "認知症予防", "筋トレ", "シニア", "高齢者",
        "長寿", "元気", "若返り", "老化防止", "サプリ", "ビタミン", "血圧", "血糖値"
    ]
    # Keywords to filter out (sports/entertainment news).
    # A Health candidate is skipped when its title contains any of these substrings.
    HEALTH_EXCLUDE_KEYWORDS = [
        "選手", "試合", "優勝", "大会", "リーグ", "プロ野球", "サッカー", "相撲",
        "芸能", "俳優", "女優", "タレント", "アイドル", "ドラマ", "映画"
    ]
    def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
        # Special handling for Health category - use multiple sources
        if category_name == "Health":
            return self._scrape_health_articles(limit)
        url = self.CATEGORIES.get(category_name)
        if not url:
            print(f"Unknown category: {category_name}")
@@ -60,8 +87,12 @@ class NewsScraper:
                    if len(title) < 5:
                        continue
-                    if category_name == "Health":
+                    # Filter health articles to exclude sports/entertainment
-                        pass 
+                    if category_name.startswith("Health"):
                        # Skip if title contains excluded keywords
                        if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS):
                            print(f"  Skipping (excluded): {title[:30]}...")
                            continue
                    # Image extraction
                    img_tag = link.find('img')
@@ -140,7 +171,7 @@ class NewsScraper:
            response = requests.get(url)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")
-            
+
            # Selector identified via browser tool: div.sc-iMCRTP.eqMceQ.yjSlinkDirectLink (or generic sc-iMCRTP)
            # Collecting all paragraphs within the main article body container
            # We look for the container div
@@ -148,18 +179,108 @@ class NewsScraper:
            if not container:
                # Fallback to article_body class search if specific class changes
                container = soup.find("div", class_=lambda x: x and "article_body" in x)
-            
+
            if container:
                paragraphs = container.find_all('p')
                text = "\n\n".join([p.get_text(strip=True) for p in paragraphs])
                return text
-            
+
            return "Content not found."
-            
+
        except Exception as e:
            print(f"Error scraping body from {url}: {e}")
            return ""
    def _scrape_health_articles(self, limit: int = 5) -> List[Article]:
        """Collect up to ``limit`` Health articles from the configured search pages.

        Each URL in ``HEALTH_SEARCH_SOURCES`` is fetched in order. Links that
        point at Yahoo! News article or pickup pages become candidates; a
        candidate is dropped when its title is shorter than five characters,
        contains any ``HEALTH_EXCLUDE_KEYWORDS`` entry, or its URL was already
        collected. Only this exclusion filter is applied; no positive keyword
        matching is performed here.

        A per-source quota spreads the result across the sources so the first
        search page cannot fill the whole result on its own.

        Args:
            limit: Maximum number of articles to return.

        Returns:
            The collected articles (possibly fewer than ``limit``), each with
            ``category="Health"`` and its body text stored in ``content``.
            A source that fails to load is logged and skipped, never raised.
        """
        # Local import keeps this method self-contained with respect to the
        # module's import block.
        from urllib.parse import urljoin

        # Without a timeout requests waits forever on a stalled connection.
        request_timeout = 10  # seconds

        print("Scraping Health articles from multiple sources...")

        all_articles = []
        seen_urls = set()

        # max(1, ...) avoids a ZeroDivisionError if the source list is emptied.
        source_total = max(1, len(self.HEALTH_SEARCH_SOURCES))
        articles_per_source = max(2, limit // source_total + 1)

        for source_url in self.HEALTH_SEARCH_SOURCES:
            if len(all_articles) >= limit:
                break

            print(f"  Searching: {source_url[:80]}...")

            try:
                response = requests.get(source_url, timeout=request_timeout)
                response.raise_for_status()
                soup = BeautifulSoup(response.content, "html.parser")

                source_count = 0
                for link in soup.find_all('a'):
                    if source_count >= articles_per_source or len(all_articles) >= limit:
                        break

                    raw_href = link.get('href')
                    if not raw_href:
                        continue

                    # Resolve relative ("/articles/...") and protocol-relative
                    # ("//news.yahoo.co.jp/...") links BEFORE the host check, so
                    # they are neither dropped nor turned into malformed URLs.
                    href = urljoin(source_url, raw_href)
                    if ('news.yahoo.co.jp/articles/' not in href
                            and 'news.yahoo.co.jp/pickup/' not in href):
                        continue
                    if href in seen_urls:
                        continue

                    title = link.get_text(strip=True)
                    if len(title) < 5:
                        continue

                    # Filter out sports/entertainment news
                    if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS):
                        print(f"    Skipping (sports/entertainment): {title[:30]}...")
                        continue

                    seen_urls.add(href)
                    print(f"    Found: {title[:40]}...")

                    # Image extraction
                    img_tag = link.find('img')
                    image_url = img_tag.get('src') if img_tag else None

                    # Pickup pages are teasers; prefer the real article URL.
                    final_url = href
                    if "/pickup/" in href:
                        real_url = self.resolve_pickup_url(href)
                        if real_url:
                            final_url = real_url

                    # A pickup link may resolve to an article that was already
                    # collected through a direct link.
                    if final_url != href:
                        if final_url in seen_urls:
                            print(f"    Skipping (duplicate): {title[:30]}...")
                            continue
                        seen_urls.add(final_url)

                    now = datetime.now()
                    article = Article(
                        title=title,
                        url=final_url,
                        image_url=image_url,
                        category="Health",
                        published_date=now.strftime("%Y-%m-%d"),
                        collected_at=now.isoformat(),
                    )

                    # Fetch content; the pause keeps the request rate polite.
                    try:
                        article.content = self.scrape_article_body(final_url)
                        time.sleep(random.uniform(0.5, 1.0))
                    except Exception as e:
                        print(f"    Failed to fetch content: {e}")
                        article.content = "Failed to load content."

                    all_articles.append(article)
                    source_count += 1

            except Exception as e:
                # Best effort: one failing source must not abort the others.
                print(f"  Error with source {source_url[:50]}: {e}")
                continue

            time.sleep(random.uniform(0.5, 1.0))

        print(f"Total Health articles collected: {len(all_articles)}")
        return all_articles
 if __name__ == "__main__":
    # Test run
    scraper = NewsScraper()