Still not good. Changes to the sites still break things. NovelUpdates.com added Cloudflare protection, so I had to switch from requests to cloudscraper. The Ziru's Musings site also removed a layer of linking, so I rewrote that parsing section. I'm still hitting an issue with a MoonBunnyCafe site that seems to be abandoned and breaks everything. I haven't been able to work around that site yet.
This commit is contained in:
74
main.py
74
main.py
@@ -1,45 +1,61 @@
|
||||
"""Scrape chapter text for a NovelUpdates series listing.

Walks the series' chapter-listing pages backwards from ``pg`` down to 2,
follows each 'extnu' external-redirect link, identifies which translator
site it landed on, and saves the chapter body to ``<title>.txt``.

Uses cloudscraper instead of plain requests because NovelUpdates sits
behind a Cloudflare anti-bot challenge.
"""
import requests
import cloudscraper
import pprint as pp
from bs4 import BeautifulSoup as bs
import os.path

# cloudscraper solves the Cloudflare challenge that blocks plain requests.
scraper = cloudscraper.create_scraper()

pg = 25  # last known listing page; the loop walks back towards page 2


def _save_chapter(title, text):
    """Write one chapter's text to '<title>.txt' (UTF-8)."""
    # Context manager guarantees the file is closed even if write() raises
    # (the old open()/close() pair leaked the handle on error).
    with open(title + ".txt", "w", encoding="utf-8") as f:
        f.write(text)


while pg > 1:
    baseURL = 'https://www.novelupdates.com/'
    series = 'lazy-dungeon-master'
    page = scraper.get(baseURL + '/series/' + series + '?pg=' + str(pg))
    print(baseURL + 'series/' + series + '?pg=' + str(pg))
    # pp.pprint(page.content)
    soup = bs(page.content, 'html.parser')
    # print(soup)
    result = soup.find(id='myTable')  # the chapter-release table
    # print(result)

    for a in result.find_all('a', href=True, title=True):
        # 'extnu' links are NovelUpdates' redirects to the translator sites.
        if 'extnu' in a['href']:
            # print(a['title'])
            # print(a['href'].strip('//'))
            if os.path.exists(a['title'] + '.txt'):
                print('skipping', a['title'])
            elif a['title'][0].lower() == 'c':  # only 'c...' (chapter) titles
                print(a['title'])
                print(a['href'].strip('//'))

                followURL = 'http://' + a['href'].strip('//')
                followPage = scraper.get(followURL)
                followSoup = bs(followPage.content, 'html.parser')

                # Identify the translator site from its <link> tags.
                # bs4 find() returns None when nothing matches, so guard
                # before subscripting -- an unrecognized site used to raise
                # TypeError here and kill the whole run.
                site_link = followSoup.find('link', href=True)
                site_href = site_link['href'] if site_link is not None else ''
                ziru_link = followSoup.find('link', title="Ziru's Musings » Feed")

                if 'http://moonbunnycafe.com/wp-content/uploads/2015/07/bunbun.jpeg' in site_href:
                    # MoonBunnyCafe appears abandoned and breaks parsing;
                    # skip the rest of this listing page entirely.
                    break
                elif 'https://coronatranslation.blogspot.com/favicon.ico' in site_href:
                    story = followSoup.find('div', class_='post-body entry-content float-container')
                    if story is not None:
                        _save_chapter(a['title'], story.get_text())
                    else:
                        print('fuck')
                elif ziru_link is not None and 'Ziru' in ziru_link['title']:
                    # Ziru's Musings now serves the chapter directly on the
                    # landed page (the extra 'Read Chapter Here' hop is gone).
                    story = followSoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
                    if story is not None:
                        _save_chapter(a['title'], story.get_text())
                    else:
                        print('fuck')
                else:
                    print('fuck')
    print('junk')
    pg = pg - 1
    # exit()
# exit()
|
||||
Reference in New Issue
Block a user