Compare commits
2 Commits
581ea49a75
...
main
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8df977eeb0 | ||
|
|
2d23d8dfc3 |
33
.gitea/workflows/deploy.yml
Normal file
33
.gitea/workflows/deploy.yml
Normal file
@@ -0,0 +1,33 @@
|
|||||||
|
# Deploys the project on every push to main: pulls the latest code into the
# deployment checkout, rebuilds and restarts the Docker Compose stack, prunes
# dangling images, and verifies that the service answers over HTTP.
#
# NOTE(review): the steps `cd` into a host path and curl localhost, so this
# presumably relies on a runner that executes directly on the home server --
# confirm that the `ubuntu-latest` label is mapped to such a runner.
name: Deploy to Home Server

on:
  push:
    branches:
      - main

env:
  DEPLOY_PATH: /home/bini/project/collect-japan-news

jobs:
  deploy:
    runs-on: ubuntu-latest
    steps:
      - name: Pull latest code
        run: |
          cd "${{ env.DEPLOY_PATH }}"
          git pull origin main

      - name: Rebuild and restart container
        run: |
          cd "${{ env.DEPLOY_PATH }}"
          # Build BEFORE touching the running stack: the old containers keep
          # serving while the image builds, and a failed build no longer
          # leaves the service stopped (the previous `down` ran first).
          docker compose build --no-cache
          # Recreate the containers from the freshly built image.
          docker compose up -d --force-recreate

      - name: Cleanup old images
        run: docker image prune -f

      - name: Health check
        run: |
          # Poll instead of a fixed sleep + single attempt: the container may
          # need more than a few seconds before it accepts connections.
          # --retry-all-errors requires curl >= 7.71.
          curl -f --retry 10 --retry-delay 3 --retry-all-errors http://localhost:8001
|
||||||
137
scraper.py
137
scraper.py
@@ -13,12 +13,39 @@ class NewsScraper:
|
|||||||
"Economy": "https://news.yahoo.co.jp/categories/business",
|
"Economy": "https://news.yahoo.co.jp/categories/business",
|
||||||
"Society": "https://news.yahoo.co.jp/categories/domestic",
|
"Society": "https://news.yahoo.co.jp/categories/domestic",
|
||||||
"Lifestyle": "https://news.yahoo.co.jp/categories/life",
|
"Lifestyle": "https://news.yahoo.co.jp/categories/life",
|
||||||
"Health": "https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7&ei=utf-8" # Search for 'Health'
|
"Health": None, # Special handling - uses multiple search sources
|
||||||
}
|
}
|
||||||
|
|
||||||
HEALTH_KEYWORDS = ["健康", "医療", "病気", "病院", "医師", "薬", "ワクチン", "感染", "介護", "認知症", "老化", "ダイエット", "運動", "睡眠", "ストレス", "メンタル"]
|
# Multiple search sources for senior-friendly health content
|
||||||
|
HEALTH_SEARCH_SOURCES = [
|
||||||
|
# Health knowledge
|
||||||
|
"https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7%E6%B3%95+%E7%94%9F%E6%B4%BB%E7%BF%92%E6%85%A3&ei=utf-8", # 健康法 生活習慣
|
||||||
|
# Good food / nutrition
|
||||||
|
"https://news.yahoo.co.jp/search?p=%E5%81%A5%E5%BA%B7+%E9%A3%9F%E4%BA%8B+%E6%A0%84%E9%A4%8A&ei=utf-8", # 健康 食事 栄養
|
||||||
|
# Senior exercise / gymnastics
|
||||||
|
"https://news.yahoo.co.jp/search?p=%E4%BD%93%E6%93%8D+%E3%82%B9%E3%83%88%E3%83%AC%E3%83%83%E3%83%81+%E3%82%A6%E3%82%A9%E3%83%BC%E3%82%AD%E3%83%B3%E3%82%B0&ei=utf-8", # 体操 ストレッチ ウォーキング
|
||||||
|
# Disease prevention
|
||||||
|
"https://news.yahoo.co.jp/search?p=%E8%AA%8D%E7%9F%A5%E7%97%87%E4%BA%88%E9%98%B2+%E8%80%81%E5%8C%96%E9%98%B2%E6%AD%A2&ei=utf-8", # 認知症予防 老化防止
|
||||||
|
]
|
||||||
|
|
||||||
|
# Keywords for senior-friendly health content
|
||||||
|
HEALTH_POSITIVE_KEYWORDS = [
|
||||||
|
"健康法", "栄養", "食事", "食べ物", "レシピ", "体操", "ストレッチ", "ウォーキング",
|
||||||
|
"睡眠", "生活習慣", "予防", "免疫", "認知症予防", "筋トレ", "シニア", "高齢者",
|
||||||
|
"長寿", "元気", "若返り", "老化防止", "サプリ", "ビタミン", "血圧", "血糖値"
|
||||||
|
]
|
||||||
|
|
||||||
|
# Keywords to filter out (sports/entertainment news)
|
||||||
|
HEALTH_EXCLUDE_KEYWORDS = [
|
||||||
|
"選手", "試合", "優勝", "大会", "リーグ", "プロ野球", "サッカー", "相撲",
|
||||||
|
"芸能", "俳優", "女優", "タレント", "アイドル", "ドラマ", "映画"
|
||||||
|
]
|
||||||
|
|
||||||
def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
|
def scrape_category(self, category_name: str, limit: int = 5) -> List[Article]:
|
||||||
|
# Special handling for Health category - use multiple sources
|
||||||
|
if category_name == "Health":
|
||||||
|
return self._scrape_health_articles(limit)
|
||||||
|
|
||||||
url = self.CATEGORIES.get(category_name)
|
url = self.CATEGORIES.get(category_name)
|
||||||
if not url:
|
if not url:
|
||||||
print(f"Unknown category: {category_name}")
|
print(f"Unknown category: {category_name}")
|
||||||
@@ -60,8 +87,12 @@ class NewsScraper:
|
|||||||
if len(title) < 5:
|
if len(title) < 5:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if category_name == "Health":
|
# Filter health articles to exclude sports/entertainment
|
||||||
pass
|
if category_name.startswith("Health"):
|
||||||
|
# Skip if title contains excluded keywords
|
||||||
|
if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS):
|
||||||
|
print(f" Skipping (excluded): {title[:30]}...")
|
||||||
|
continue
|
||||||
|
|
||||||
# Image extraction
|
# Image extraction
|
||||||
img_tag = link.find('img')
|
img_tag = link.find('img')
|
||||||
@@ -140,7 +171,7 @@ class NewsScraper:
|
|||||||
response = requests.get(url)
|
response = requests.get(url)
|
||||||
response.raise_for_status()
|
response.raise_for_status()
|
||||||
soup = BeautifulSoup(response.content, "html.parser")
|
soup = BeautifulSoup(response.content, "html.parser")
|
||||||
|
|
||||||
# Selector identified via browser tool: div.sc-iMCRTP.eqMceQ.yjSlinkDirectLink (or generic sc-iMCRTP)
|
# Selector identified via browser tool: div.sc-iMCRTP.eqMceQ.yjSlinkDirectLink (or generic sc-iMCRTP)
|
||||||
# Collecting all paragraphs within the main article body container
|
# Collecting all paragraphs within the main article body container
|
||||||
# We look for the container div
|
# We look for the container div
|
||||||
@@ -148,18 +179,108 @@ class NewsScraper:
|
|||||||
if not container:
|
if not container:
|
||||||
# Fallback to article_body class search if specific class changes
|
# Fallback to article_body class search if specific class changes
|
||||||
container = soup.find("div", class_=lambda x: x and "article_body" in x)
|
container = soup.find("div", class_=lambda x: x and "article_body" in x)
|
||||||
|
|
||||||
if container:
|
if container:
|
||||||
paragraphs = container.find_all('p')
|
paragraphs = container.find_all('p')
|
||||||
text = "\n\n".join([p.get_text(strip=True) for p in paragraphs])
|
text = "\n\n".join([p.get_text(strip=True) for p in paragraphs])
|
||||||
return text
|
return text
|
||||||
|
|
||||||
return "Content not found."
|
return "Content not found."
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
print(f"Error scraping body from {url}: {e}")
|
print(f"Error scraping body from {url}: {e}")
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
def _scrape_health_articles(self, limit: int = 5, timeout: float = 10.0) -> List[Article]:
    """Scrape health articles from multiple sources with filtering for senior-friendly content.

    Each URL in ``HEALTH_SEARCH_SOURCES`` is fetched in order. Anchors that
    point at Yahoo News article or pickup pages are turned into ``Article``
    objects (category ``"Health"``) unless their title is shorter than five
    characters or contains one of ``HEALTH_EXCLUDE_KEYWORDS``. Pickup links
    are resolved through ``resolve_pickup_url`` and the body text is fetched
    with ``scrape_article_body``.

    Args:
        limit: Maximum number of articles to return overall.
        timeout: Seconds to wait for each search-page request before the
            source is abandoned.

    Returns:
        Up to ``limit`` articles, with at most
        ``max(2, limit // number_of_sources + 1)`` taken from any one source.
        A source that raises is reported and skipped.
    """
    # Local import: the file-level import block is not part of this method.
    from urllib.parse import urljoin

    print("Scraping Health articles from multiple sources...")

    all_articles: List[Article] = []
    seen_urls = set()
    sources = self.HEALTH_SEARCH_SOURCES

    # Nothing to do; this also avoids dividing by zero when no sources are configured.
    if limit <= 0 or not sources:
        print(f"Total Health articles collected: {len(all_articles)}")
        return all_articles

    # Spread the quota over the sources so one search does not fill the whole list.
    articles_per_source = max(2, limit // len(sources) + 1)

    for source_url in sources:
        if len(all_articles) >= limit:
            break

        print(f" Searching: {source_url[:80]}...")

        try:
            # Without a timeout a stalled connection would block the run indefinitely.
            response = requests.get(source_url, timeout=timeout)
            response.raise_for_status()
            soup = BeautifulSoup(response.content, "html.parser")

            source_count = 0

            for link in soup.find_all('a'):
                if source_count >= articles_per_source or len(all_articles) >= limit:
                    break

                href = link.get('href')
                if not href:
                    continue

                # Normalise site-relative ("/articles/...") and protocol-relative
                # ("//news.yahoo.co.jp/...") links BEFORE matching; previously
                # the former were never matched and the latter got a doubled host.
                # Assumes BASE_URL is the scheme + host of the site -- TODO confirm.
                href = urljoin(self.BASE_URL, href)

                is_article = 'news.yahoo.co.jp/articles/' in href
                is_pickup = 'news.yahoo.co.jp/pickup/' in href
                if not (is_article or is_pickup):
                    continue

                if href in seen_urls:
                    continue

                title = link.get_text(strip=True)
                if len(title) < 5:
                    continue

                # Filter out sports/entertainment news
                if any(kw in title for kw in self.HEALTH_EXCLUDE_KEYWORDS):
                    print(f" Skipping (sports/entertainment): {title[:30]}...")
                    continue

                seen_urls.add(href)

                # Resolve pickup URLs to the underlying article page
                final_url = href
                if "/pickup/" in href:
                    real_url = self.resolve_pickup_url(href)
                    if real_url:
                        final_url = real_url

                # A pickup page and a direct link may lead to the same article;
                # de-duplicate on the resolved URL as well.
                if final_url != href:
                    if final_url in seen_urls:
                        continue
                    seen_urls.add(final_url)

                print(f" Found: {title[:40]}...")

                # Image extraction
                img_tag = link.find('img')
                image_url = img_tag.get('src') if img_tag else None

                now = datetime.now()
                article = Article(
                    title=title,
                    url=final_url,
                    image_url=image_url,
                    category="Health",
                    published_date=now.strftime("%Y-%m-%d"),
                    collected_at=now.isoformat()
                )

                # Fetch content; a failure here must not discard the article itself.
                try:
                    article.content = self.scrape_article_body(final_url)
                    # Short random pause between article requests.
                    time.sleep(random.uniform(0.5, 1.0))
                except Exception as e:
                    print(f" Failed to fetch content: {e}")
                    article.content = "Failed to load content."

                all_articles.append(article)
                source_count += 1

        except Exception as e:
            # Deliberate best-effort boundary: one broken source must not
            # prevent the remaining sources from being tried.
            print(f" Error with source {source_url[:50]}: {e}")
            continue

        # Short random pause before moving on to the next search source.
        time.sleep(random.uniform(0.5, 1.0))

    print(f"Total Health articles collected: {len(all_articles)}")
    return all_articles
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
# Test run
|
# Test run
|
||||||
scraper = NewsScraper()
|
scraper = NewsScraper()
|
||||||
|
|||||||
Reference in New Issue
Block a user