Still not good. Changes to the sites still break shit. NovelUpdates.com added Cloudflare, so I had to switch from requests to cloudscraper. The Ziru's Musings site also removed a layer of linking, so I rewrote that parsing section. I'm still hitting an issue with a MoonBunny site that seems to be abandoned and breaks everything. I haven't been able to work around that site yet.

2021-03-10 00:17:59 -05:00
parent fd3776cc4a
commit 52c3c4bb7b
1 changed files with 45 additions and 29 deletions
--- a/main.py
+++ b/main.py
@@ -1,45 +1,61 @@
-import requests
+import cloudscraper
 import pprint as pp
 from bs4 import BeautifulSoup as bs
+import os.path

-pg = 27
+scraper = cloudscraper.create_scraper()
+pg = 25

 while pg > 1:

    baseURL = 'https://www.novelupdates.com/'
    series = 'lazy-dungeon-master'
-    page = requests.get(baseURL + '/series/' + series + '?pg=' + str(pg))
+    page = scraper.get(baseURL + '/series/' + series + '?pg=' + str(pg))
+    print(baseURL + 'series/' + series + '?pg=' + str(pg))
    # pp.pprint(page.content)
    soup = bs(page.content, 'html.parser')
+    # print(soup)
    result = soup.find(id='myTable')
-
+    # print(result)

    for a in result.find_all('a', href=True, title=True):
        if 'extnu' in a['href']:
-            print(a['title'])
-            print(a['href'].strip('//'))
-            followURL = 'http://' + a['href'].strip('//')
-            followPage = requests.get(followURL)
-            followSoup = bs(followPage.content, 'html.parser')
-            # test = followSoup.find('link', href=True)
-            if 'https://coronatranslation.blogspot.com/favicon.ico' in followSoup.find('link', href=True)['href']:
-                story = followSoup.find('div', class_='post-body entry-content float-container')
-                story_text = story.get_text()
-                # print(story_text)
-                f = open(a['title']+".txt", "w", encoding="utf-8")
-                f.write(story_text)
-                f.close()
-            elif 'Ziru' in followSoup.find('link', title="Ziru's Musings » Feed")['title']:
-                storyURL = followSoup.find('a', text='Read Chapter Here', href=True)['href']
-                storyPage = requests.get(storyURL)
-                storySoup = bs(storyPage.content, 'html.parser')
-                story = storySoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
-                story_text = story.get_text()
-                # print(story_text)
-                f = open(a['title'] + ".txt", "w", encoding="utf-8")
-                f.write(story_text)
-                f.close()
+            # print(a['title'])
+            # print(a['href'].strip('//'))
+            if os.path.exists(a['title']+'.txt'):
+                print('skipping', a['title'])
+            elif a['title'][0].lower() == 'c':
+                print(a['title'])
+                print(a['href'].strip('//'))
+
+                followURL = 'http://' + a['href'].strip('//')
+                followPage = scraper.get(followURL)
+                followSoup = bs(followPage.content, 'html.parser')
+                if 'http://moonbunnycafe.com/wp-content/uploads/2015/07/bunbun.jpeg' in followSoup.find('link', href=True)['href']:
+                    break
+                # test = followSoup.find('link', href=True)
+                elif 'https://coronatranslation.blogspot.com/favicon.ico' in followSoup.find('link', href=True)['href']:
+                    story = followSoup.find('div', class_='post-body entry-content float-container')
+                    story_text = story.get_text()
+                    # print(story_text)
+                    f = open(a['title']+".txt", "w", encoding="utf-8")
+                    f.write(story_text)
+                    f.close()
+                elif 'Ziru' in followSoup.find('link', title="Ziru's Musings » Feed")['title']:
+                    # storyURL = followSoup.find('a', text='Read Chapter Here', href=True)['href']
+                    # storyPage = scraper.get(storyURL)
+                    # storySoup = bs(storyPage.content, 'html.parser')
+                    # story = storySoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
+                    story = followSoup.find('div', class_='elementor-element elementor-element-7ba99198 elementor-widget elementor-widget-theme-post-content')
+                    if story is not None:
+                        story_text = story.get_text()
+                        # print(story_text)
+                        f = open(a['title'] + ".txt", "w", encoding="utf-8")
+                        f.write(story_text)
+                        f.close()
+                else:
+                    print('fuck')
            else:
-                print('fuck')
+                print('junk')
    pg = pg-1
-        # exit()
+            # exit()