61 lines
2.8 KiB
Python
61 lines
2.8 KiB
Python
import cloudscraper
|
|
import pprint as pp
|
|
from bs4 import BeautifulSoup as bs
|
|
import os.path
|
|
|
|
# Download chapters of one NovelUpdates series as plain-text files.
#
# Walks the series' release-list pages from page 25 down to page 2, follows
# every "extnu" link whose title starts with "c", and writes the chapter text
# to "<title>.txt" in the current directory when the target page belongs to a
# recognised site. Chapters whose file already exists are skipped.
scraper = cloudscraper.create_scraper()

# Loop invariants: defined once instead of on every page iteration.
baseURL = 'https://www.novelupdates.com/'
series = 'lazy-dungeon-master'

# Markers looked for in the followed page to tell the hosting sites apart.
_MOONBUNNY_MARKER = 'http://moonbunnycafe.com/wp-content/uploads/2015/07/bunbun.jpeg'
_CORONA_MARKER = 'https://coronatranslation.blogspot.com/favicon.ico'
_ZIRU_FEED_TITLE = "Ziru's Musings » Feed"

# CSS class strings of the <div> holding the chapter text on each site.
_CORONA_STORY_CLASS = 'post-body entry-content float-container'
_ZIRU_STORY_CLASS = (
    'elementor-element elementor-element-7ba99198 '
    'elementor-widget elementor-widget-theme-post-content'
)


def _save_story(title, story):
    """Write the text of *story* to '<title>.txt'.

    *story* is the tag returned by ``find`` for the chapter container, or
    ``None`` when the container was not present; in that case nothing is
    written and the miss is reported on stdout.
    """
    if story is None:
        print('fuck')
        return
    # The context manager closes the file even if the write fails.
    with open(title + '.txt', 'w', encoding='utf-8') as f:
        f.write(story.get_text())


# NOTE(review): the loop stops before page 1, exactly as the original
# `while pg > 1` did -- confirm that skipping page 1 is intended.
for pg in range(25, 1, -1):
    # Build the URL once so the request and the log line cannot diverge
    # (the original requested '...com//series/...' but printed a single slash).
    pageURL = baseURL + 'series/' + series + '?pg=' + str(pg)
    print(pageURL)
    page = scraper.get(pageURL)
    soup = bs(page.content, 'html.parser')

    result = soup.find(id='myTable')
    if result is None:
        # find() returns None when nothing matches; skip this page rather
        # than crash on result.find_all.
        print('no release table on', pageURL)
        continue

    for a in result.find_all('a', href=True, title=True):
        title = a['title']
        href = a['href']

        if 'extnu' not in href:
            continue
        if os.path.exists(title + '.txt'):
            print('skipping', title)
            continue
        # startswith() is safe for an empty title, unlike title[0].
        if not title.lower().startswith('c'):
            continue

        # str.strip() takes a set of characters, so this removes every
        # leading/trailing '/' (same effect as the original strip('//')).
        target = href.strip('/')
        print(title)
        print(target)

        followPage = scraper.get('http://' + target)
        followSoup = bs(followPage.content, 'html.parser')

        # Guard against pages without any <link href=...> tag.
        first_link = followSoup.find('link', href=True)
        first_href = first_link['href'] if first_link is not None else ''

        if _MOONBUNNY_MARKER in first_href:
            # Stop processing the remaining links of this page.
            break
        if _CORONA_MARKER in first_href:
            _save_story(title, followSoup.find('div', class_=_CORONA_STORY_CLASS))
        elif followSoup.find('link', title=_ZIRU_FEED_TITLE) is not None:
            _save_story(title, followSoup.find('div', class_=_ZIRU_STORY_CLASS))
        else:
            # NOTE(review): taken to be the fallback for unrecognised sites;
            # previously unreachable because the Ziru lookup raised TypeError
            # on None before this branch could run.
            print('junk')