https://git.nicholasjohnson.ch/nicksphere-gmi
Revision c77cc20bbeaf017b9426f2de548aefd20c8e4007 authored by Nicholas Johnson on 29 November 2021, 00:00:00 UTC, committed by Nicholas Johnson on 29 November 2021, 00:00:00 UTC
1 parent 371ee0b
Tip revision: c77cc20
Update submodule
find_broken_links.py
'''
nicksphere-gmi Generates a Gemini capsule from existing files
Copyright (C) 2021 Nicholas Johnson

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program. If not, see <https://www.gnu.org/licenses/>.
'''
import os
import sys
from concurrent.futures import ThreadPoolExecutor
from http.server import HTTPServer, SimpleHTTPRequestHandler
from pathlib import Path
from threading import Thread
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

from config import website_config
def start_http_server(host, port, path):
    # Serve `path` over HTTP on a daemon thread; return the server so the
    # caller can shut it down once crawling is finished.
    os.chdir(path)

    class SilentSimpleHTTPRequestHandler(SimpleHTTPRequestHandler):
        def log_message(self, format, *args):
            # Suppress per-request logging so crawler output stays readable.
            return

    server = HTTPServer((host, port), SilentSimpleHTTPRequestHandler)
    thread = Thread(target=server.serve_forever, daemon=True)
    thread.start()
    return server
def get_links(root):
    # Crawl the site starting at `root` and return the set of all links found.
    unexplored = {root}
    explored = set()

    def _is_relative_path(url):
        # A URL with no network location is a relative path.
        return not bool(urlparse(url).netloc)

    def _is_external(url):
        # Anything not under `root` is external: record it, but don't crawl it.
        return root not in url

    while unexplored:
        url = unexplored.pop()
        if _is_external(url):
            explored.add(url)
            continue
        try:
            data = requests.get(url).text
        except (requests.exceptions.InvalidSchema,
                requests.exceptions.ConnectionError):
            # Unfetchable from here; find_broken_links() re-checks every link.
            explored.add(url)
            continue
        explored.add(url)
        html = BeautifulSoup(data, features="html.parser")
        for atag in html.find_all("a"):
            link = atag.get("href")
            if not link:
                continue
            if _is_relative_path(link):
                link = urljoin(url, link)
            link = urljoin(root, link)
            if link not in explored:
                unexplored.add(link)
    return explored
def find_broken_links(links):
    # Yield each broken link; the generator's return value is the total count.
    def _is_broken(url):
        try:
            head = requests.head(url)
            return url if head.status_code == 404 else False
        except requests.exceptions.ConnectionError:
            return url
        except requests.exceptions.InvalidSchema:
            # Non-HTTP links (e.g. mailto:) can't be checked; treat as fine.
            return False

    count = 0
    with ThreadPoolExecutor(max_workers=8) as executor:
        for link in executor.map(_is_broken, links):
            if link:
                count += 1
                yield link
    return count
def main():
    class Generator:
        # Wraps a generator so the value it returns (via StopIteration) is
        # captured in `self.value` once iteration finishes.
        def __init__(self, gen):
            self.gen = gen

        def __iter__(self):
            self.value = yield from self.gen

    rootdir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    sitepath = os.path.join(rootdir, 'website')
    print("\033[1;32m[+] Starting HTTP server...")
    server = start_http_server('127.0.0.1', 8000, sitepath)
    print("[+] HTTP server started successfully.")
    print("[+] Crawling webpages...")
    links = get_links('http://127.0.0.1:8000')
    print(f"[+] Finished crawling. {len(links)} links detected.")
    print("[+] Now printing broken links...\033[31;1m")
    wrapped_generator = Generator(find_broken_links(links))
    for link in wrapped_generator:
        print("[-] " + link)
    print(f"\033[1;32m[+] Finished printing with a total of "
          f"{wrapped_generator.value} broken links.\033[1;0m")
    server.shutdown()
if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Reset terminal colors before exiting; fall back to os._exit if
        # sys.exit is interrupted during interpreter cleanup.
        print("\033[1;0m")
        try:
            sys.exit(0)
        except SystemExit:
            os._exit(0)
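
# Usage sketch (an assumption from main() above, not documented elsewhere:
# the generated site is expected to live in ../website relative to this
# script):
#
#   $ python find_broken_links.py
#
# The script serves the site on 127.0.0.1:8000, crawls every internal page,
# and prints each link that returns 404 or refuses the connection.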
