Changes from all commits
25 commits
3a62f5e
Use f-strings instead of .format()
rivermont Nov 1, 2021
59e124d
Remove stray parenthesis.
rivermont Apr 28, 2022
15d4e8c
Remove obsolete configs.
rivermont Apr 28, 2022
a242c7c
Adding argparse stuff
lkotlus Aug 1, 2024
584a6ac
Basic outline of out of scope options
lkotlus Aug 1, 2024
043a834
Add out of scope functionality and adjust the restricted domain logic
lkotlus Aug 1, 2024
cb0e33e
Fix my wording on out of scope stuff
lkotlus Aug 1, 2024
69e4255
Fix syntax error (I am a programming genius)
lkotlus Aug 1, 2024
6155f7b
Fix some of my logic
lkotlus Aug 1, 2024
251230d
If the argument is used, don't go looking for user input
lkotlus Aug 1, 2024
0b109c7
Check if OUT_OF_SCOPE was set
lkotlus Aug 1, 2024
da4d2c7
Scratch that previous commit...
lkotlus Aug 1, 2024
91080bc
Optimize by preventing multiple checks of the same URL
lkotlus Aug 1, 2024
2ebe062
Fix some globals and whatnot
lkotlus Aug 2, 2024
305ee30
Update config files, add selenium (import only, no code yet) to crawl…
lkotlus Aug 2, 2024
2ccf5b1
Fix imports
lkotlus Aug 2, 2024
6b9f1b8
This should work
lkotlus Aug 2, 2024
476ccc0
Fix interceptor function
lkotlus Aug 2, 2024
b05e006
Bug fixes and testing
lkotlus Aug 2, 2024
cb4f856
Fix requirements
lkotlus Aug 2, 2024
e417d34
Update docs and fix comments
lkotlus Aug 2, 2024
1456694
Contributors
lkotlus Aug 2, 2024
62b4668
Remove unnecessary print
lkotlus Aug 5, 2024
1547563
KNOWN_ERROR_COUNT referenced before assignment fixed.
lkotlus Aug 5, 2024
b37fd41
Add maximum time
lkotlus Aug 8, 2024
6 changes: 4 additions & 2 deletions README.md
@@ -13,8 +13,8 @@ Pretty simple!
![All Platforms!](https://img.shields.io/badge/Windows,%20OS/X,%20Linux-%20%20-brightgreen.svg)
![Open Source Love](https://badges.frapsoft.com/os/v1/open-source.png?v=103)
<br>
![Lines of Code: 1553](https://img.shields.io/badge/lines%20of%20code-1553-brightgreen.svg)
![Lines of Docs: 605](https://img.shields.io/badge/lines%20of%20docs-605-orange.svg)
![Lines of Code: 1811](https://img.shields.io/badge/lines%20of%20code-1811-brightgreen.svg)
![Lines of Docs: 619](https://img.shields.io/badge/lines%20of%20docs-619-orange.svg)
[![Last Commit](https://img.shields.io/github/last-commit/rivermont/spidy.svg)](https://github.com/rivermont/spidy/graphs/punch-card)
[![Travis CI Status](https://img.shields.io/travis/com/rivermont/spidy)](https://travis-ci.com/github/rivermont/spidy)
[![PyPI Wheel](https://img.shields.io/pypi/wheel/spidy-web-crawler.svg)](https://pypi.org/project/spidy-web-crawler/)
@@ -101,6 +101,7 @@ Here are some features we figure are worth noting.
- Cross-Platform compatibility: spidy will work on all three major operating systems, Windows, Mac OS/X, and Linux!
- Frequent Timestamp Logging: Spidy logs almost every action it takes to both the console and one of two log files.
- Browser Spoofing: Make requests using User Agents from 4 popular web browsers, use a custom spidy bot one, or create your own!
- Headless Browser Support: Render full webpages in a headless browser to capture dynamically loaded content.
- Portability: Move spidy's folder and its contents somewhere else and it will run right where it left off. *Note*: This only works if you run it from source code.
- User-Friendly Logs: Both the console and log file messages are simple and easy to interpret, but packed with information.
- Webpage saving: Spidy downloads each page that it runs into, regardless of file type. The crawler uses the HTTP `Content-Type` header returned with most files to determine the file type.
@@ -225,6 +226,7 @@ See the [`CONTRIBUTING.md`](https://github.com/rivermont/spidy/blob/master/spidy
* [quatroka](https://github.com/quatroka) - Fixed testing bugs.
* [stevelle](https://github.com/stevelle) - Respect robots.txt.
* [thatguywiththatname](https://github.com/thatguywiththatname) - README link corrections.
* [lkotlus](https://github.com/lkotlus) - Optimizations, out-of-scope filtering, and headless browser support.

# License
We used the [Gnu General Public License](https://www.gnu.org/licenses/gpl-3.0.en.html) (see [`LICENSE`](https://github.com/rivermont/spidy/blob/master/LICENSE)) as it was the license that best suited our needs.<br>
3 changes: 3 additions & 0 deletions requirements.txt
@@ -2,3 +2,6 @@ requests
lxml
flake8
reppy
selenium
selenium-wire
blinker==1.7.0
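
The three new dependencies above support the headless-browser mode toggled by the new `USE_BROWSER` option: selenium drives the browser, selenium-wire exposes the underlying HTTP traffic, and blinker is pinned to 1.7.0, presumably for selenium-wire compatibility. A minimal sketch of what a headless fetch might look like; the function name and flow are illustrative assumptions, not spidy's actual crawler code:

```python
# Illustrative sketch only (not spidy's actual code): fetch a page with
# headless Chrome through selenium-wire, then read the rendered HTML and
# the Content-Type of the main response.
from selenium.webdriver.chrome.options import Options
from seleniumwire import webdriver  # selenium-wire wraps selenium's webdriver


def render_page(url):
    options = Options()
    options.add_argument("--headless")  # run the browser without a window
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
        html = driver.page_source  # DOM after JavaScript has executed
        content_type = None
        # selenium-wire records the HTTP traffic the browser generated,
        # so response headers remain available for file-type detection.
        for request in driver.requests:
            if request.response and request.url == url:
                content_type = request.response.headers.get("Content-Type")
                break
        return html, content_type
    finally:
        driver.quit()
```

When `USE_BROWSER` is False, pages are presumably still fetched with plain `requests`, as before.
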
9 changes: 9 additions & 0 deletions spidy/config/blank.cfg
@@ -28,6 +28,9 @@ RESTRICT = <True/False>
# The domain within which to restrict crawling.
DOMAIN = ''

# Domains, subdomains, and paths that are out of scope for the crawl
OUT_OF_SCOPE = ['', '']

# Whether to respect sites' robots.txt or not
RESPECT_ROBOTS = <True/False>

@@ -48,11 +51,17 @@ HEADER = HEADERS['<Header>']
# Or if you want to use custom headers:
HEADER = {'<Header Name>': '<Value>', '<Header2>': '<Value2>'}

# Whether to render pages with a headless browser (more thorough, but slower)
USE_BROWSER = <True/False>

# Amount of errors allowed to happen before automatic shutdown.
MAX_NEW_ERRORS = <Int>
MAX_KNOWN_ERRORS = <Int>
MAX_HTTP_ERRORS = <Int>
MAX_NEW_MIMES = <Int>

# Maximum time (in seconds) the crawl is allowed to run (set to float('inf') to run indefinitely)
MAX_TIME = <Int>

# Pages to start crawling on in case TODO is empty at start.
START = ['', '']
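
blank.cfg now documents three related knobs: `RESTRICT` and `DOMAIN` confine the crawl to one domain, while the new `OUT_OF_SCOPE` list carves exceptions out of it. A minimal sketch of how such a check might combine them, assuming simple substring matching; the function name and matching rule are assumptions, not spidy's implementation:

```python
# Hypothetical helper; spidy's real logic lives in crawler.py and may differ.
def in_scope(url, restrict, domain, out_of_scope):
    # URLs containing any out-of-scope fragment are skipped outright.
    if any(fragment in url for fragment in out_of_scope):
        return False
    # With RESTRICT enabled, only URLs containing DOMAIN are kept.
    if restrict and domain not in url:
        return False
    return True
```
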
3 changes: 3 additions & 0 deletions spidy/config/default.cfg
@@ -7,14 +7,17 @@ ZIP_FILES = True
OVERRIDE_SIZE = False
RESTRICT = False
DOMAIN = ''
OUT_OF_SCOPE = []
RESPECT_ROBOTS = True
TODO_FILE = 'crawler_todo.txt'
DONE_FILE = 'crawler_done.txt'
WORD_FILE = 'crawler_words.txt'
SAVE_COUNT = 100
HEADER = HEADERS['spidy']
USE_BROWSER = False
MAX_NEW_ERRORS = 5
MAX_KNOWN_ERRORS = 10
MAX_HTTP_ERRORS = 20
MAX_NEW_MIMES = 10
MAX_TIME = float('inf')
START = ['https://en.wikipedia.org/wiki/Main_Page']
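
default.cfg leaves the new `MAX_TIME` limit disabled by setting it to `float('inf')`. A rough sketch of how a wall-clock limit like this could be enforced around a crawl loop; variable names and loop structure are hypothetical, not spidy's code:

```python
import time

MAX_TIME = 600  # seconds; float('inf') disables the limit, as in default.cfg
START = ['https://en.wikipedia.org/wiki/Main_Page']

start_time = time.time()
todo = list(START)

while todo:
    # Stop once the elapsed wall-clock time exceeds MAX_TIME.
    if time.time() - start_time > MAX_TIME:
        print('Maximum crawl time reached; saving progress and shutting down.')
        break
    url = todo.pop(0)
    # ... fetch url, save the page, and append newly found links to todo ...
```
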
3 changes: 3 additions & 0 deletions spidy/config/docker.cfg
@@ -7,14 +7,17 @@ ZIP_FILES = True
OVERRIDE_SIZE = False
RESTRICT = False
DOMAIN = ''
OUT_OF_SCOPE = []
RESPECT_ROBOTS = True
TODO_FILE = '/data/crawler_todo.txt'
DONE_FILE = '/data/crawler_done.txt'
WORD_FILE = '/data/crawler_words.txt'
SAVE_COUNT = 100
HEADER = HEADERS['spidy']
USE_BROWSER = False
MAX_NEW_ERRORS = 5
MAX_KNOWN_ERRORS = 10
MAX_HTTP_ERRORS = 20
MAX_NEW_MIMES = 10
MAX_TIME = float('inf')
START = ['https://en.wikipedia.org/wiki/Main_Page']
3 changes: 3 additions & 0 deletions spidy/config/heavy.cfg
@@ -7,14 +7,17 @@ ZIP_FILES = True
OVERRIDE_SIZE = True
RESTRICT = False
DOMAIN = ''
OUT_OF_SCOPE = []
RESPECT_ROBOTS = False
TODO_FILE = 'crawler_todo.txt'
DONE_FILE = 'crawler_done.txt'
WORD_FILE = 'crawler_words.txt'
SAVE_COUNT = 100
HEADER = HEADERS['spidy']
USE_BROWSER = True
MAX_NEW_ERRORS = 5
MAX_KNOWN_ERRORS = 10
MAX_HTTP_ERRORS = 20
MAX_NEW_MIMES = 10
MAX_TIME = float('inf')
START = ['https://en.wikipedia.org/wiki/Main_Page']
3 changes: 3 additions & 0 deletions spidy/config/infinite.cfg
@@ -7,14 +7,17 @@ ZIP_FILES = True
OVERRIDE_SIZE = False
RESTRICT = False
DOMAIN = ''
OUT_OF_SCOPE = []
RESPECT_ROBOTS = True
TODO_FILE = 'crawler_todo.txt'
DONE_FILE = 'crawler_done.txt'
WORD_FILE = 'crawler_words.txt'
SAVE_COUNT = 250
HEADER = HEADERS['spidy']
USE_BROWSER = False
MAX_NEW_ERRORS = 1000000
MAX_KNOWN_ERRORS = 1000000
MAX_HTTP_ERRORS = 1000000
MAX_NEW_MIMES = 1000000
MAX_TIME = float('inf')
START = ['https://en.wikipedia.org/wiki/Main_Page']
3 changes: 3 additions & 0 deletions spidy/config/light.cfg
@@ -7,14 +7,17 @@ OVERRIDE_SIZE = False
SAVE_WORDS = False
RESTRICT = False
DOMAIN = ''
OUT_OF_SCOPE = []
RESPECT_ROBOTS = True
TODO_FILE = 'crawler_todo.txt'
DONE_FILE = 'crawler_done.txt'
WORD_FILE = 'crawler_words.txt'
SAVE_COUNT = 150
HEADER = HEADERS['spidy']
USE_BROWSER = False
MAX_NEW_ERRORS = 5
MAX_KNOWN_ERRORS = 10
MAX_HTTP_ERRORS = 20
MAX_NEW_MIMES = 10
MAX_TIME = 600
START = ['https://en.wikipedia.org/wiki/Main_Page']
3 changes: 3 additions & 0 deletions spidy/config/multithreaded.cfg
@@ -7,14 +7,17 @@ ZIP_FILES = True
OVERRIDE_SIZE = False
RESTRICT = False
DOMAIN = ''
OUT_OF_SCOPE = []
RESPECT_ROBOTS = False
TODO_FILE = 'crawler_todo.txt'
DONE_FILE = 'crawler_done.txt'
WORD_FILE = 'crawler_words.txt'
SAVE_COUNT = 100
HEADER = HEADERS['spidy']
USE_BROWSER = False
MAX_NEW_ERRORS = 5
MAX_KNOWN_ERRORS = 10
MAX_HTTP_ERRORS = 20
MAX_NEW_MIMES = 10
MAX_TIME = float('inf')
START = ['https://en.wikipedia.org/wiki/Main_Page']
21 changes: 0 additions & 21 deletions spidy/config/rivermont-infinite.cfg

This file was deleted.

20 changes: 0 additions & 20 deletions spidy/config/rivermont.cfg

This file was deleted.

5 changes: 5 additions & 0 deletions spidy/config/wsj.cfg
@@ -12,14 +12,19 @@ RESTRICT = True
# The domain within which to restrict crawling.
DOMAIN = 'wsj.com/'

# Specific pages and subdomains that are excluded from the crawl
OUT_OF_SCOPE = ['wsj.com/business/airlines', 'africa.wsj.com']

RESPECT_ROBOTS = True
TODO_FILE = 'wsj_todo.txt'
DONE_FILE = 'wsj_done.txt'
WORD_FILE = 'wsj_words.txt'
SAVE_COUNT = 60
HEADER = HEADERS['spidy']
USE_BROWSER = False
MAX_NEW_ERRORS = 100
MAX_KNOWN_ERRORS = 100
MAX_HTTP_ERRORS = 100
MAX_NEW_MIMES = 5
MAX_TIME = float('inf')
START = ['https://www.wsj.com/']
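
With these values, and assuming the substring-style matching sketched after blank.cfg above, the wsj.cfg scope rules would behave roughly as follows; the example URLs are illustrative:

```python
# Illustrative only; assumes substring matching against OUT_OF_SCOPE entries.
out_of_scope = ['wsj.com/business/airlines', 'africa.wsj.com']

for url in ['https://www.wsj.com/politics',
            'https://www.wsj.com/business/airlines/delta',
            'https://africa.wsj.com/news']:
    skipped = any(fragment in url for fragment in out_of_scope)
    print(url, '-> skipped' if skipped else '-> crawled')
```
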