Extracting Links

Extract all links from a webpage HTML

Crates: reqwest, select. Category: net.

Use reqwest::get to perform an HTTP GET request and then use Document::from to parse the response body into an HTML document. find with the predicate Name("a") retrieves all links. Calling filter_map on the selection retrieves the URLs from the links that have an "href" attribute.

mod links {
use thiserror::Error;
use select::document::Document;
use select::predicate::Name;

#[derive(Error, Debug)]
pub enum LinkError {
    #[error("Reqwest error: {0}")]
    ReqError(#[from] reqwest::Error),
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),
}

pub async fn get_links(page: &str) -> Result<Vec<Box<str>>, LinkError> {
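  // Download the page and read the response body as text.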
  let res = reqwest::get(page)
    .await?
    .text()
    .await?;

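  // Parse the HTML and keep the href attribute of every <a> element.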
  let links = Document::from(res.as_str())
    .find(Name("a"))
    .filter_map(|node| node.attr("href"))
    .map(|link| Box::<str>::from(link))
    .collect();

  Ok(links)
}

}

#[tokio::main]
async fn main() -> Result<(), links::LinkError> {
    let page_links = links::get_links("https://www.rust-lang.org/en-US/").await?;
    for link in page_links {
        println!("{}", link);
    }
    Ok(())
}
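
The same extraction also works without an async runtime by using reqwest's blocking client. This is a minimal sketch, not part of the original recipe; the function name get_links_blocking is made up here, and it assumes the "blocking" feature of reqwest is enabled in Cargo.toml:

fn get_links_blocking(page: &str) -> Result<Vec<String>, Box<dyn std::error::Error>> {
  // reqwest::blocking::get performs the GET request synchronously.
  let body = reqwest::blocking::get(page)?.text()?;
  // Same select-based extraction as above, collecting owned Strings.
  let links = select::document::Document::from(body.as_str())
    .find(select::predicate::Name("a"))
    .filter_map(|node| node.attr("href"))
    .map(str::to_owned)
    .collect();
  Ok(links)
}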

Check a webpage for broken links

Crates: reqwest, select, url. Category: net.

Call get_base_url to retrieve the base URL. If the document has a base tag, get the href attribute from the base tag; otherwise, the portion of the original URL up to Position::BeforePath acts as the default.

Iterate through the links in the document, parsing each one relative to the base URL with url::ParseOptions and Url::parse. For every parsed link, create a tokio::spawn task that makes a request with reqwest and verifies the StatusCode. The tasks are then awaited, and each link is categorized as ok or broken before the program ends.

mod broken {
use thiserror::Error;
use reqwest::StatusCode;
use select::document::Document;
use select::predicate::Name;
use std::collections::HashSet;
use url::{Position, Url};

#[derive(Error, Debug)]
pub enum BrokenError {
    #[error("Reqwest error: {0}")]
    ReqError(#[from] reqwest::Error),
    #[error("IO error: {0}")]
    IoError(#[from] std::io::Error),
    #[error("URL parse error: {0}")]
    UrlParseError(#[from] url::ParseError),
    #[error("Join error: {0}")]
    JoinError(#[from] tokio::task::JoinError),
}

pub struct CategorizedUrls {
    pub ok: Vec<String>,
    pub broken: Vec<String>,
}

enum Link {
    GoodLink(Url),
    BadLink(Url),
}

async fn get_base_url(url: &Url, doc: &Document) -> Result<Url, BrokenError> {
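  // Prefer the href of a <base> tag if the document declares one.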
  let base_tag_href = doc.find(Name("base")).filter_map(|n| n.attr("href")).nth(0);
  let base_url =
    base_tag_href.map_or_else(|| Url::parse(&url[..Position::BeforePath]), Url::parse)?;
  Ok(base_url)
}

async fn check_link(url: &Url) -> Result<bool, BrokenError> {
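  // A link counts as broken only when the server answers 404 Not Found.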
  let res = reqwest::get(url.as_ref()).await?;
  Ok(res.status() != StatusCode::NOT_FOUND)
}

pub async fn check(site: &str) -> Result<CategorizedUrls, BrokenError> {
  let url = Url::parse(site)?;
  let res = reqwest::get(url.as_ref()).await?.text().await?;
  let document = Document::from(res.as_str());
  let base_url = get_base_url(&url, &document).await?;
  let base_parser = Url::options().base_url(Some(&base_url));
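  // Resolve every href relative to the base URL; collecting into a HashSet removes duplicates.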
  let links: HashSet<Url> = document
    .find(Name("a"))
    .filter_map(|n| n.attr("href"))
    .filter_map(|link| base_parser.parse(link).ok())
    .collect();
  let mut tasks = vec![];
  let mut ok = vec![];
  let mut broken = vec![];

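  // Spawn one task per link so the HTTP checks run concurrently.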
  for link in links {
    tasks.push(tokio::spawn(async move {
      if check_link(&link).await.unwrap_or(false) {
        Link::GoodLink(link)
      } else {
        Link::BadLink(link)
      }
    }));
  }

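  // Await every task and sort the results into the ok and broken lists.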
  for task in tasks {
    match task.await? {
      Link::GoodLink(link) => ok.push(link.to_string()),
      Link::BadLink(link) => broken.push(link.to_string()),
    }
  }

  Ok(CategorizedUrls { ok, broken })
}

}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let categorized = broken::check("https://www.rust-lang.org/en-US/").await?;
    println!("OK: {:?}", categorized.ok);
    println!("Broken: {:?}", categorized.broken);
    Ok(())
}
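
The recipe above flags a link as broken only when the response is 404 Not Found. A stricter variant, sketched here as an assumption rather than part of the original recipe (check_link_strict is a made-up name), could treat any non-success status as broken:

async fn check_link_strict(url: &url::Url) -> Result<bool, reqwest::Error> {
  // Any status outside the 2xx range counts as broken.
  let res = reqwest::get(url.as_ref()).await?;
  Ok(res.status().is_success())
}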

Extract all unique links from MediaWiki markup

Crates: reqwest, regex. Category: net.

Pull the source of a MediaWiki page using reqwest::get and then look for all entries of internal and external links with Regex::captures_iter. Using Cow avoids excessive String allocations.

MediaWiki link syntax is described in MediaWiki's Help:Links documentation. The calling function will retain the whole document, and links will be returned as slice references to the original document.

mod wiki {
use regex::Regex;
use std::borrow::Cow;
use std::collections::HashSet;
use std::sync::LazyLock;

pub fn extract_links(content: &str) -> HashSet<Cow<str>> {
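  // Two alternatives: [[...]] internal wiki links, and external http links introduced by url=, URL| or [.
  // (?x) enables verbose mode, so the whitespace and # comments inside the pattern are ignored.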
  static WIKI_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(
      r"(?x)
                \[\[(?P<internal>[^\[\]|]*)[^\[\]]*\]\]    # internal links
                |
                (url=|URL\||\[)(?P<external>http.*?)[ \|}] # external links
            "
    )
    .unwrap()
  );

  let links: HashSet<_> = WIKI_REGEX
    .captures_iter(content)
    .map(|c| match (c.name("internal"), c.name("external")) {
        (Some(val), None) => Cow::from(val.as_str()),
        (None, Some(val)) => Cow::from(val.as_str()),
        _ => unreachable!(),
    })
    .collect();

  links
}
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
  let content = reqwest::get(
    "https://en.wikipedia.org/w/index.php?title=Rust_(programming_language)&action=raw",
  )
  .await?
  .text()
  .await?;

  println!("{:#?}", wiki::extract_links(content.as_str()));

  Ok(())
}
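
As a quick sanity check that needs no network request, extract_links can be exercised on a small hand-written piece of markup. The sample text and the test below are illustrative assumptions, not part of the original recipe:

#[test]
fn extracts_internal_and_external_links() {
  // Hand-written sample markup, assumed for illustration only.
  let sample = "See [[Rust (programming language)|Rust]] and [https://www.rust-lang.org/ the site].";
  let links = wiki::extract_links(sample);
  assert!(links.contains("Rust (programming language)"));
  assert!(links.contains("https://www.rust-lang.org/"));
}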