diff --git a/scraper.py b/scraper.py index 717311c..6cbdd2d 100644 --- a/scraper.py +++ b/scraper.py @@ -13,12 +13,39 @@ class NewsScraper: "Economy": "https://news.yahoo.co.jp/categories/business", "Society": "https://news.yahoo.co.jp/categories/domestic", "Lifestyle": "https://news.yahoo.co.jp/categories/life", - "Health": "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7&ei=utf-8" # Search for 'Health' + "Health": None, # Special handling - uses multiple search sources } - HEALTH_KEYWORDS = ["健康", "医療", "病気", "病院", "医師", "薬", "ワクチン", "感染", "介護", "認知症", "老化", "ダイエット", "運動", "睡眠", "ストレス", "メンタル"] + # Multiple search sources for senior-friendly health content + HEALTH_SEARCH_SOURCES = [ + # Health knowledge / lifestyle habits + "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7%E6%B3%95+%E7%94%9F%E6%B4%BB%E7%BF%92%E6%85%A3&ei=utf-8", # 健康法 生活習慣 + # Healthy food / nutrition + "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7+%E9%A3%9F%E4%BA%8B+%E6%A0%84%E9%A4%8A&ei=utf-8", # 健康 食事 栄養 + # Senior-friendly exercise / stretching + "https://news.yahoo.co.jp/search?p=%E4%BD%93%E6%93%8D+%E3%82%B9%E3%83%88%E3%83%AC%E3%83%83%E3%83%81+%E3%82%A6%E3%82%A9%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0&ei=utf-8", # 体操 ストレッチ ウォーキング + # Disease prevention + "https://news.yahoo.co.jp/search?p=%E8%AA%8D%E7%9F%A5%E7%97%87%E4%BA%88%E9%98%B2+%E8%80%81%E5%8C%96%E9%98%B2%E6%AD%A2&ei=utf-8", # 認知症予防 老化防止 + ] + + # Keywords for senior-friendly health content + HEALTH_POSITIVE_KEYWORDS = [ + "健康法", "栄養", "食事", "食べ物", "レシピ", "体操", "ストレッチ", "ウォーキング", + "睡眠", "生活習慣", "予防", "免疫", "認知症予防", "筋トレ", "シニア", "高齢者", + "長寿", "元気", "若返り", "老化防止", "サプリ", "ビタミン", "血圧", "血糖値" + ] + + # Keywords to filter out (sports/entertainment news) + HEALTH_EXCLUDE_KEYWORDS = [ + "選手", "試合", "優勝", "大会", "リーグ", "プロ野球", "サッカー", "相撲", + "芸能", "俳優", "女優", "タレント", "アイドル", "ドラマ", "映画" + ] def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]: + # Special handling for Health category - use multiple sources + if category_name == "Health": + return self._scrape_health_articles(limit) + 
url = self.CATEGORIES.get(category_name) if not url: print(f"Unknown category: {category_name}") @@ -60,8 +87,12 @@ class NewsScraper: if len(title) < 5: continue - if category_name == "Health": - pass + # Filter health articles to exclude sports/entertainment + if category_name.startswith("Health"): + # Skip if title contains excluded keywords + if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS): + print(f" Skipping (excluded): {title[:30]}...") + continue # Image extraction img_tag = link.find('img') @@ -140,7 +171,7 @@ class NewsScraper: response = requests.get(url) response.raise_for_status() soup = BeautifulSoup(response.content, "html.parser") - + # Selector identified via browser tool: div.sc-iMCRTP.eqMceQ.yjSlinkDirectLink (or generic sc-iMCRTP) # Collecting all paragraphs within the main article body container # We look for the container div @@ -148,18 +179,108 @@ class NewsScraper: if not container: # Fallback to article_body class search if specific class changes container = soup.find("div", class_=lambda x: x and "article_body" in x) - + if container: paragraphs = container.find_all('p') text = "\n\n".join([p.get_text(strip=True) for p in paragraphs]) return text - + return "Content not found." 
- + except Exception as e: print(f"Error scraping body from {url}: {e}") return "" + def _scrape_health_articles(self, limit: int = 5) -> List[Article]: + """Scrape health articles from multiple sources with filtering for senior-friendly content.""" + print("Scraping Health articles from multiple sources...") + + all_articles = [] + seen_urls = set() + articles_per_source = max(2, limit // len(self.HEALTH_SEARCH_SOURCES) + 1) + + for source_url in self.HEALTH_SEARCH_SOURCES: + if len(all_articles) >= limit: + break + + print(f" Searching: {source_url[:80]}...") + + try: + response = requests.get(source_url) + response.raise_for_status() + soup = BeautifulSoup(response.content, "html.parser") + + candidates = soup.find_all('a') + source_count = 0 + + for link in candidates: + if source_count >= articles_per_source or len(all_articles) >= limit: + break + + href = link.get('href') + if not href: + continue + + if 'news.yahoo.co.jp/articles/' in href or 'news.yahoo.co.jp/pickup/' in href: + if href.startswith('/'): + href = self.BASE_URL + href + + if href in seen_urls: + continue + + title = link.get_text(strip=True) + if len(title) < 5: + continue + + # Filter out sports/entertainment news + if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS): + print(f" Skipping (sports/entertainment): {title[:30]}...") + continue + + seen_urls.add(href) + print(f" Found: {title[:40]}...") + + # Image extraction + img_tag = link.find('img') + image_url = img_tag.get('src') if img_tag else None + + # Resolve pickup URLs + final_url = href + if "/pickup/" in href: + real_url = self.resolve_pickup_url(href) + if real_url: + final_url = real_url + + article = Article( + title=title, + url=final_url, + image_url=image_url, + category="Health", + published_date=datetime.now().strftime("%Y-%m-%d"), + collected_at=datetime.now().isoformat() + ) + + # Fetch content + try: + content = self.scrape_article_body(final_url) + article.content = content + time.sleep(random.uniform(0.5, 
1.0)) + except Exception as e: + print(f" Failed to fetch content: {e}") + article.content = "Failed to load content." + + all_articles.append(article) + source_count += 1 + + except Exception as e: + print(f" Error with source {source_url[:50]}: {e}") + continue + + time.sleep(random.uniform(0.5, 1.0)) + + print(f"Total Health articles collected: {len(all_articles)}") + return all_articles + if __name__ == "__main__": # Test run scraper = NewsScraper()