diff --git a/main.py b/main.py
index 600206a..d92a288 100644
--- a/main.py
+++ b/main.py
@@ -1,42 +1,58 @@
-import requests
+import cloudscraper
 import pprint as pp
 from bs4 import BeautifulSoup as bs
+import os.path
 
-pg = 27
+scraper = cloudscraper.create_scraper()
+pg = 25
 while pg > 1:
     baseURL = 'https://www.novelupdates.com/'
     series = 'lazy-dungeon-master'
-    page = requests.get(baseURL + '/series/' + series + '?pg=' + str(pg))
+    page = scraper.get(baseURL + '/series/' + series + '?pg=' + str(pg))
+    print(baseURL + 'series/' + series + '?pg=' + str(pg))
     # pp.pprint(page.content)
     soup = bs(page.content, 'html.parser')
+    # print(soup)
     result = soup.find(id='myTable')
-
+    # print(result)
     for a in result.find_all('a', href=True, title=True):
         if 'extnu' in a['href']:
-            print(a['title'])
-            print(a['href'].strip('//'))
-            followURL = 'http://' + a['href'].strip('//')
-            followPage = requests.get(followURL)
-            followSoup = bs(followPage.content, 'html.parser')
-            # test = followSoup.find('link', href=True)
-            if 'https://coronatranslation.blogspot.com/favicon.ico' in followSoup.find('link', href=True)['href']:
-                story = followSoup.find('div', class_='post-body entry-content float-container')
-                story_text = story.get_text()
-                # print(story_text)
-                f = open(a['title']+".txt", "w", encoding="utf-8")
-                f.write(story_text)
-                f.close()
-            elif 'Ziru' in followSoup.find('link', title="Ziru's Musings » Feed")['title']:
-                storyURL = followSoup.find('a', text='Read Chapter Here', href=True)['href']
-                storyPage = requests.get(storyURL)
-                storySoup = bs(storyPage.content, 'html.parser')
-                story = storySoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
-                story_text = story.get_text()
-                # print(story_text)
-                f = open(a['title'] + ".txt", "w", encoding="utf-8")
-                f.write(story_text)
-                f.close()
+            # print(a['title'])
+            # print(a['href'].strip('//'))
+            if os.path.exists(a['title']+'.txt'):
+                print('skipping', a['title'])
+            elif a['title'][0].lower() == 'c':
+                print(a['title'])
+                print(a['href'].strip('//'))
+
+                followURL = 'http://' + a['href'].strip('//')
+                followPage = scraper.get(followURL)
+                followSoup = bs(followPage.content, 'html.parser')
+                if 'http://moonbunnycafe.com/wp-content/uploads/2015/07/bunbun.jpeg' in followSoup.find('link', href=True)['href']:
+                    break
+                # test = followSoup.find('link', href=True)
+                elif 'https://coronatranslation.blogspot.com/favicon.ico' in followSoup.find('link', href=True)['href']:
+                    story = followSoup.find('div', class_='post-body entry-content float-container')
+                    story_text = story.get_text()
+                    # print(story_text)
+                    f = open(a['title']+".txt", "w", encoding="utf-8")
+                    f.write(story_text)
+                    f.close()
+                elif 'Ziru' in followSoup.find('link', title="Ziru's Musings » Feed")['title']:
+                    # storyURL = followSoup.find('a', text='Read Chapter Here', href=True)['href']
+                    # storyPage = scraper.get(storyURL)
+                    # storySoup = bs(storyPage.content, 'html.parser')
+                    # story = storySoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
+                    story = followSoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
+                    if story is not None:
+                        story_text = story.get_text()
+                        # print(story_text)
+                        f = open(a['title'] + ".txt", "w", encoding="utf-8")
+                        f.write(story_text)
+                        f.close()
+                    else:
+                        print('fuck')
         else:
-            print('fuck')
+            print('junk')
     pg = pg-1
-    # exit()
\ No newline at end of file
+    # exit()
\ No newline at end of file