# Recovered from the diff that introduced main.py:
#
#   commit fd3776cc4aaa05a14ef496a671a65c22920ad171
#   Author: Dan Dembinski
#   Date:   Fri Jun 19 02:00:45 2020 -0400
#
#       Initial Commit. Sort of works, but the parsing is rough. Only works
#       for two translation sites right now, and one of those sites seems to
#       have changed its format at some point, which breaks the loop.
"""Save chapters of a NovelUpdates series as plain-text files.

The script walks the series' release-listing pages, follows every outbound
"extnu" link found in the ``myTable`` element, and, when the page it lands on
is recognised as one of the two supported translation sites, writes the text
of the chapter body to ``<link title>.txt`` in the current directory.

Pages, links, or chapters that cannot be fetched or parsed are reported on
stdout and skipped, so one odd page no longer aborts the whole run.
"""
import pprint as pp

import requests
from bs4 import BeautifulSoup as bs

BASE_URL = 'https://www.novelupdates.com'
SERIES = 'lazy-dungeon-master'

# Listing pages are visited from FIRST_PAGE down to LAST_PAGE, inclusive.
# NOTE(review): the original loop was `while pg > 1`, i.e. it never visited
# page 1. That range is preserved here; set LAST_PAGE = 1 if skipping the
# first page was not intended.
FIRST_PAGE = 27
LAST_PAGE = 2

# Seconds to wait on any single HTTP request before giving up; without a
# timeout `requests` can block indefinitely on an unresponsive server.
REQUEST_TIMEOUT = 30

# Markers used to recognise each supported site and locate its chapter body.
CORONA_FAVICON = 'https://coronatranslation.blogspot.com/favicon.ico'
CORONA_BODY_CLASS = 'post-body entry-content float-container'
# "\u00bb" is the right-pointing double angle quote. It is written as an
# escape because the literal had been corrupted into two Thai characters by a
# wrong-codepage decode, which could never match the real feed title.
ZIRU_FEED_TITLE = "Ziru's Musings \u00bb Feed"
ZIRU_LINK_TEXT = 'Read Chapter Here'
ZIRU_BODY_CLASS = ('elementor-element elementor-element-7ba99198 '
                   'elementor-widget elementor-widget-theme-post-content')

# Characters that are path separators or are rejected in file names on common
# platforms; each is replaced with "_" when building the output file name.
_UNSAFE_FILENAME_CHARS = '\\/:*?"<>|'


def _fetch_soup(url):
    """Fetch *url* and return its parsed HTML, or None if the request fails.

    The HTTP status code is not checked (the original script did not check
    it either); only transport-level failures are treated as errors.
    """
    try:
        response = requests.get(url, timeout=REQUEST_TIMEOUT)
    except requests.RequestException as err:
        print('Request failed for {}: {}'.format(url, err))
        return None
    return bs(response.content, 'html.parser')


def _safe_filename(title):
    """Return *title* with characters unsafe in file names replaced by "_"."""
    return ''.join(
        '_' if ch in _UNSAFE_FILENAME_CHARS else ch for ch in title
    )


def _ziru_story(follow_soup):
    """Return the chapter-body tag for a Ziru's Musings landing page.

    The landing page only links to the chapter, so the "Read Chapter Here"
    anchor is followed and the body is looked up on the page it points to.
    Returns None if the anchor, the linked page, or the body is missing.
    """
    chapter_link = follow_soup.find('a', string=ZIRU_LINK_TEXT, href=True)
    if chapter_link is None:
        return None
    story_soup = _fetch_soup(chapter_link['href'])
    if story_soup is None:
        return None
    return story_soup.find('div', class_=ZIRU_BODY_CLASS)


def _process_chapter(title, follow_url):
    """Download the chapter behind *follow_url* and save it as ``<title>.txt``.

    Nothing is written when the page cannot be fetched, the site is not one
    of the supported ones, or the chapter body cannot be located; a message
    explaining which is printed instead.
    """
    follow_soup = _fetch_soup(follow_url)
    if follow_soup is None:
        return

    # `find` returns None when nothing matches, so every lookup is checked
    # before it is subscripted. Previously an unrecognised page raised
    # TypeError here instead of reaching the "unsupported" branch.
    first_link = follow_soup.find('link', href=True)
    if first_link is not None and CORONA_FAVICON in first_link['href']:
        story = follow_soup.find('div', class_=CORONA_BODY_CLASS)
    elif follow_soup.find('link', title=ZIRU_FEED_TITLE) is not None:
        story = _ziru_story(follow_soup)
    else:
        print('Unsupported translation site, skipping: ' + follow_url)
        return

    if story is None:
        print('Could not locate chapter text for {!r}, skipping'.format(title))
        return

    # The context manager closes the file even if the write raises.
    with open(_safe_filename(title) + '.txt', 'w', encoding='utf-8') as out:
        out.write(story.get_text())


def main():
    """Walk the configured listing pages and save every recognised chapter."""
    for pg in range(FIRST_PAGE, LAST_PAGE - 1, -1):
        listing_url = '{}/series/{}?pg={}'.format(BASE_URL, SERIES, pg)
        listing = _fetch_soup(listing_url)
        if listing is None:
            continue

        table = listing.find(id='myTable')
        if table is None:
            print('No chapter table found on ' + listing_url)
            continue

        for a in table.find_all('a', href=True, title=True):
            if 'extnu' not in a['href']:
                continue
            title = a['title']
            # Slashes are trimmed from both ends of the href before the
            # scheme is prepended (presumably the hrefs are protocol-relative,
            # "//host/..." -- confirm against the live markup).
            link = a['href'].strip('/')
            print(title)
            print(link)
            _process_chapter(title, 'http://' + link)


if __name__ == '__main__':
    main()