https://git.nicholasjohnson.ch/nicksphere-gmi
Revision c77cc20bbeaf017b9426f2de548aefd20c8e4007 authored by Nicholas Johnson on 29 November 2021, 00:00:00 UTC, committed by Nicholas Johnson on 29 November 2021, 00:00:00 UTC
1 parent 371ee0b
Raw File
Tip revision: c77cc20bbeaf017b9426f2de548aefd20c8e4007 authored by Nicholas Johnson on 29 November 2021, 00:00:00 UTC
Update submodule
Tip revision: c77cc20
find_broken_links.py
'''
    nicksphere-gmi  Generates a Gemini capsule from existing files
    Copyright (C) 2021  Nicholas Johnson

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <https://www.gnu.org/licenses/>.
'''

import requests
from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup
from concurrent.futures import ThreadPoolExecutor
import os
import sys
from pathlib import Path
from threading import Thread
from http.server import SimpleHTTPRequestHandler, HTTPServer
from config import website_config


def start_http_server(host, port, path):
    """Serve the directory *path* over HTTP on (host, port) in a daemon thread.

    Returns the HTTPServer instance so the caller can .shutdown() it.
    Pass port=0 to let the OS choose a free port; the bound port is then
    available as server.server_address[1].
    """

    class SilentSimpleHTTPRequestHandler(SimpleHTTPRequestHandler):
        """Request handler that serves *path* and suppresses access logging."""

        def __init__(self, *args, **kwargs):
            # BUG FIX: the original called os.chdir(path), mutating the whole
            # process's working directory.  The `directory` argument
            # (Python 3.7+) scopes the served root to this handler instead.
            super().__init__(*args, directory=path, **kwargs)

        def log_message(self, format, *args):
            # Silence the default per-request stderr log line.
            return

    server = HTTPServer((host, port), SilentSimpleHTTPRequestHandler)

    # serve_forever runs in a daemon thread so it dies with the main program.
    # (The original wrapped this in try/except KeyboardInterrupt, but that
    # was dead code: Thread.start() returns immediately, so Ctrl-C can never
    # be raised inside the try block.)
    t = Thread(target=server.serve_forever)
    t.daemon = True
    t.start()

    return server


def get_links(root):
    """Breadth-first crawl of the site rooted at *root*.

    Returns the set of every URL discovered, including external and
    unreachable ones (those are recorded but never fetched), so that
    find_broken_links() can probe them afterwards.
    """
    unexplored = {root}
    explored = set()

    def _is_relative_path(url):
        # A URL without a netloc ("/about", "page.html") is site-relative.
        return not bool(urlparse(url).netloc)

    def _is_external(url):
        # Anything not under the crawl root is external: record, don't fetch.
        return root not in url

    while unexplored:
        url = unexplored.pop()
        explored.add(url)

        if _is_external(url):
            continue

        try:
            data = requests.get(url).text
        except requests.exceptions.InvalidSchema:
            # e.g. mailto:, gemini:// — not fetchable over HTTP.
            continue
        except requests.exceptions.ConnectionError:
            # BUG FIX: the original did `pass` here and fell through to use
            # `data`, which is unbound on the first failure (NameError) or
            # stale from the previous iteration.  The URL stays in `explored`
            # so find_broken_links() will still flag it.
            continue

        html = BeautifulSoup(data, features="html.parser")

        for atag in html.find_all("a"):
            link = atag.get("href")
            if link is None:
                # BUG FIX: an <a> tag without href yields None, which the
                # original passed to urlparse() and crashed on.
                continue
            if _is_relative_path(link):
                link = urljoin(url, link)
            link = urljoin(root, link)
            if link not in explored:
                unexplored.add(link)

    return explored


def find_broken_links(links):
    """Probe *links* concurrently, yielding each broken URL.

    A URL is broken when a HEAD request returns 404, times out, or the
    connection fails.  Non-HTTP schemes (mailto:, etc.) are skipped.
    The generator's return value is the total count of broken links,
    retrievable via `yield from` / StopIteration.value (see main()).
    """
    def _is_broken(url):
        # Return the URL when broken, a falsy value otherwise.
        try:
            # BUG FIX: the original had no timeout, so one hung server
            # stalled the whole scan forever; Timeout (e.g. ReadTimeout)
            # was also uncaught and would kill the generator.
            head = requests.head(url, timeout=10)

            if head.status_code == 404:
                return url
            else:
                return False
        except (requests.exceptions.ConnectionError,
                requests.exceptions.Timeout):
            return url
        except requests.exceptions.InvalidSchema:
            # Not an HTTP(S) URL — not checkable, so not broken.
            return False

    count = 0

    with ThreadPoolExecutor(max_workers=8) as executor:
        for link in executor.map(_is_broken, links):
            if link:
                count += 1
                yield link

    return count

def main():
    """Serve the generated website locally, crawl it, and report broken links."""

    class Generator:
        """Iterator wrapper that captures a generator's return value.

        After exhaustion, the value returned by the wrapped generator
        (carried on StopIteration) is available as .value.
        """

        def __init__(self, gen):
            self.gen = gen

        def __iter__(self):
            # `yield from` surfaces the generator's return value.
            self.value = yield from self.gen

    repo_root = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    site_dir = os.path.join(repo_root, 'website')

    print("\033[1;32m[+] Starting HTTP server...")
    server = start_http_server('127.0.0.1', 8000, site_dir)
    print("[+] HTTP server started successfully.")
    print("[+] Crawling webpages...")
    links = get_links('http://127.0.0.1:8000')
    print(f"[+] Finished crawling. {len(links)} links detected.")
    print("[+] Now printing broken links...\033[31;1m")

    broken = Generator(find_broken_links(links))
    for url in broken:
        print("[-] " + url)
    print(f"\033[1;32m[+] Finished printing with a total of \
{broken.value} broken links.\033[1;0m")

    server.shutdown()


if __name__ == '__main__':
    try:
        main()
    except KeyboardInterrupt:
        # Reset terminal ANSI attributes before exiting on Ctrl-C (the
        # crawl prints colored output and may be interrupted mid-color).
        print("\033[1;0m")
        try:
            sys.exit(0)
        except SystemExit:
            # os._exit skips interpreter cleanup, avoiding hangs or tracebacks
            # from teardown (e.g. the daemonized HTTP server thread).
            os._exit(0)
back to top