Still not good. Changes to the sites still break the scraper. NovelUpdates.com added Cloudflare protection, so I had to switch from requests to cloudscraper. The Ziru's Musings site also removed a layer of linking, so I rewrote that parsing section. I'm still hitting an issue with a MoonBunny site that seems to be abandoned and breaks everything; I haven't been able to work around that site yet.

This commit is contained in:
2021-03-10 00:17:59 -05:00
parent fd3776cc4a
commit 52c3c4bb7b

74
main.py
View File

@@ -1,45 +1,61 @@
import requests
import cloudscraper
import pprint as pp
from bs4 import BeautifulSoup as bs
import os.path
# Scraper script body: walks the paginated release table of one series on
# novelupdates.com, follows each external release link, and saves the chapter
# text to "<link title>.txt".
# NOTE(review): this text comes from a diff view in which indentation and the
# +/- markers are absent, so the old and new versions of some statements appear
# back to back and the real nesting must be confirmed against main.py.

# Two assignments to pg with one statement between them; only the later value
# (25) is in effect when the loop starts. Presumably 27 is the pre-commit
# value -- confirm.
pg = 27
scraper = cloudscraper.create_scraper()
pg = 25
# Loop condition is pg > 1; pg is decremented by `pg = pg-1` further down.
while pg > 1:
baseURL = 'https://www.novelupdates.com/'
series = 'lazy-dungeon-master'
# The same URL is fetched twice in this view: once with requests and once with
# the cloudscraper session (presumably the old and new side of the diff).
# baseURL already ends in '/', so the requested URL contains '//series/'.
page = requests.get(baseURL + '/series/' + series + '?pg=' + str(pg))
page = scraper.get(baseURL + '/series/' + series + '?pg=' + str(pg))
# The printed URL is built with 'series/' (no leading slash), so it is not
# character-for-character the URL that was requested above.
print(baseURL + 'series/' + series + '?pg=' + str(pg))
# pp.pprint(page.content)
soup = bs(page.content, 'html.parser')
# print(soup)
# Element with id 'myTable'; the result is used below without a None check.
result = soup.find(id='myTable')
# print(result)
# Every anchor that carries both an href and a title attribute.
for a in result.find_all('a', href=True, title=True):
# Only hrefs containing 'extnu' are followed.
if 'extnu' in a['href']:
# --- L34-56 of the diff view: presumably the superseded (pre-commit) body;
# --- it still fetches with requests. Confirm against main.py.
print(a['title'])
# str.strip takes a set of characters, so strip('//') removes every leading
# and trailing '/' from the href.
print(a['href'].strip('//'))
followURL = 'http://' + a['href'].strip('//')
followPage = requests.get(followURL)
followSoup = bs(followPage.content, 'html.parser')
# test = followSoup.find('link', href=True)
# Site detection: substring test against the href of the first <link> that has
# an href attribute.
if 'https://coronatranslation.blogspot.com/favicon.ico' in followSoup.find('link', href=True)['href']:
story = followSoup.find('div', class_='post-body entry-content float-container')
story_text = story.get_text()
# print(story_text)
# The link title is used unchanged as the output file name.
f = open(a['title']+".txt", "w", encoding="utf-8")
f.write(story_text)
f.close()
# find() returns None when no <link> has this exact title, in which case the
# ['title'] subscript raises instead of falling through to another branch.
elif 'Ziru' in followSoup.find('link', title="Ziru's Musings » Feed")['title']:
# Follows the 'Read Chapter Here' anchor to a second page and reads the
# chapter from there.
storyURL = followSoup.find('a', text='Read Chapter Here', href=True)['href']
storyPage = requests.get(storyURL)
storySoup = bs(storyPage.content, 'html.parser')
story = storySoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
story_text = story.get_text()
# print(story_text)
f = open(a['title'] + ".txt", "w", encoding="utf-8")
f.write(story_text)
f.close()
# --- L57-92 of the diff view: presumably the replacement (post-commit) body;
# --- all fetches here go through the cloudscraper session.
# print(a['title'])
# print(a['href'].strip('//'))
# Skip links whose output file already exists.
if os.path.exists(a['title']+'.txt'):
print('skipping', a['title'])
# Otherwise only titles whose first character is 'c' or 'C' are processed.
elif a['title'][0].lower() == 'c':
print(a['title'])
print(a['href'].strip('//'))
followURL = 'http://' + a['href'].strip('//')
followPage = scraper.get(followURL)
followSoup = bs(followPage.content, 'html.parser')
# MoonBunny pages are recognised by this image URL in the first <link href>.
# NOTE(review): break leaves the innermost enclosing loop -- presumably the
# `for a` loop, which would also drop the remaining links on this page; confirm.
if 'http://moonbunnycafe.com/wp-content/uploads/2015/07/bunbun.jpeg' in followSoup.find('link', href=True)['href']:
break
# test = followSoup.find('link', href=True)
elif 'https://coronatranslation.blogspot.com/favicon.ico' in followSoup.find('link', href=True)['href']:
story = followSoup.find('div', class_='post-body entry-content float-container')
story_text = story.get_text()
# print(story_text)
f = open(a['title']+".txt", "w", encoding="utf-8")
f.write(story_text)
f.close()
# Same unguarded find(...)['title'] subscript as in the earlier Ziru branch.
elif 'Ziru' in followSoup.find('link', title="Ziru's Musings » Feed")['title']:
# The intermediate 'Read Chapter Here' hop is commented out below; the chapter
# container is now looked up directly on the followed page.
# storyURL = followSoup.find('a', text='Read Chapter Here', href=True)['href']
# storyPage = scraper.get(storyURL)
# storySoup = bs(storyPage.content, 'html.parser')
# story = storySoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
story = followSoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
# Unlike the other branches, this one writes the file only when the container
# was actually found.
if story is not None:
story_text = story.get_text()
# print(story_text)
f = open(a['title'] + ".txt", "w", encoding="utf-8")
f.write(story_text)
f.close()
# NOTE(review): two consecutive else branches; which `if` each one pairs with
# cannot be determined from this flattened view.
else:
print('fuck')
else:
print('fuck')
print('junk')
# Step to the next lower page number.
pg = pg-1
# exit()
# exit()