Extracting Links
Extract all links from a webpage HTML
Use reqwest::get to perform an HTTP GET request and then use Document::from to parse the response body into an HTML document. find with the Name("a") predicate retrieves all links. Calling filter_map on the selection keeps only the URLs from links that have an "href" attribute.
```rust
mod links {
    use select::document::Document;
    use select::predicate::Name;
    use thiserror::Error;

    #[derive(Error, Debug)]
    pub enum LinkError {
        #[error("Reqwest error: {0}")]
        ReqError(#[from] reqwest::Error),
        #[error("IO error: {0}")]
        IoError(#[from] std::io::Error),
    }

    pub async fn get_links(page: &str) -> Result<Vec<Box<str>>, LinkError> {
        let res = reqwest::get(page)
            .await?
            .text()
            .await?;

        let links = Document::from(res.as_str())
            .find(Name("a"))
            .filter_map(|node| node.attr("href"))
            .map(|link| Box::<str>::from(link.to_string()))
            .collect();

        Ok(links)
    }
}

#[tokio::main]
async fn main() -> Result<(), links::LinkError> {
    let page_links = links::get_links("https://www.rust-lang.org/en-US/").await?;

    for link in page_links {
        println!("{}", link);
    }

    Ok(())
}
```
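The find and filter_map steps above are easy to check in isolation. Below is a minimal sketch, using a made-up inline HTML snippet rather than a fetched page, showing that Document::from parses a string directly and that filter_map keeps only the anchors carrying an "href" attribute.

```rust
use select::document::Document;
use select::predicate::Name;

fn main() {
    // Made-up snippet: three anchors, one of which has no href.
    let html = r#"<a href="/one">one</a><a>no href</a><a href="https://example.org">two</a>"#;

    let hrefs: Vec<&str> = Document::from(html)
        .find(Name("a"))                      // every <a> element
        .filter_map(|node| node.attr("href")) // keep only those with an href
        .collect();

    assert_eq!(hrefs, ["/one", "https://example.org"]);
}
```

The extracted values are returned exactly as written in the page, so relative paths such as "/one" are not resolved; the next recipe shows how to resolve links against a base URL.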
Check a webpage for broken links
Call get_base_url to retrieve the base URL. If the document has a base tag, use the href attribute from that tag; otherwise the portion of the original URL up to Position::BeforePath acts as the default. Iterate through the links in the document, resolving each one against the base URL with url::ParseOptions and Url::parse, and create a tokio::spawn task per link. Each task makes a request to its link with reqwest and checks the StatusCode. The tasks are then awaited to completion before the program ends.
```rust
mod broken {
    use reqwest::StatusCode;
    use select::document::Document;
    use select::predicate::Name;
    use std::collections::HashSet;
    use thiserror::Error;
    use url::{Position, Url};

    #[derive(Error, Debug)]
    pub enum BrokenError {
        #[error("Reqwest error: {0}")]
        ReqError(#[from] reqwest::Error),
        #[error("IO error: {0}")]
        IoError(#[from] std::io::Error),
        #[error("URL parse error: {0}")]
        UrlParseError(#[from] url::ParseError),
        #[error("Join error: {0}")]
        JoinError(#[from] tokio::task::JoinError),
    }

    pub struct CategorizedUrls {
        pub ok: Vec<String>,
        pub broken: Vec<String>,
    }

    enum Link {
        GoodLink(Url),
        BadLink(Url),
    }

    async fn get_base_url(url: &Url, doc: &Document) -> Result<Url, BrokenError> {
        let base_tag_href = doc.find(Name("base")).filter_map(|n| n.attr("href")).nth(0);
        let base_url = base_tag_href
            .map_or_else(|| Url::parse(&url[..Position::BeforePath]), Url::parse)?;
        Ok(base_url)
    }

    async fn check_link(url: &Url) -> Result<bool, BrokenError> {
        let res = reqwest::get(url.as_ref()).await?;
        Ok(res.status() != StatusCode::NOT_FOUND)
    }

    pub async fn check(site: &str) -> Result<CategorizedUrls, BrokenError> {
        let url = Url::parse(site)?;
        let res = reqwest::get(url.as_ref()).await?.text().await?;
        let document = Document::from(res.as_str());
        let base_url = get_base_url(&url, &document).await?;
        let base_parser = Url::options().base_url(Some(&base_url));
        let links: HashSet<Url> = document
            .find(Name("a"))
            .filter_map(|n| n.attr("href"))
            .filter_map(|link| base_parser.parse(link).ok())
            .collect();
        let mut tasks = vec![];
        let mut ok = vec![];
        let mut broken = vec![];

        for link in links {
            tasks.push(tokio::spawn(async move {
                if check_link(&link).await.unwrap_or(false) {
                    Link::GoodLink(link)
                } else {
                    Link::BadLink(link)
                }
            }));
        }

        for task in tasks {
            match task.await? {
                Link::GoodLink(link) => ok.push(link.to_string()),
                Link::BadLink(link) => broken.push(link.to_string()),
            }
        }

        Ok(CategorizedUrls { ok, broken })
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let categorized = broken::check("https://www.rust-lang.org/en-US/").await?;
    println!("OK: {:?}", categorized.ok);
    println!("Broken: {:?}", categorized.broken);

    Ok(())
}
```
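The base-URL handling is the subtle part of this recipe. The sketch below, with example URLs chosen purely for illustration, shows what slicing with Position::BeforePath produces and how a ParseOptions parser configured via Url::options resolves relative links against a base, mirroring what get_base_url and base_parser do above.

```rust
use url::{Position, Url};

fn main() -> Result<(), url::ParseError> {
    let url = Url::parse("https://www.rust-lang.org/en-US/index.html?x=1")?;

    // Slicing up to Position::BeforePath keeps only the scheme and authority,
    // which is the fallback get_base_url uses when the page has no <base> tag.
    assert_eq!(&url[..Position::BeforePath], "https://www.rust-lang.org");

    // A parser configured with a base URL resolves relative hrefs, just as
    // base_parser.parse(link) does for each extracted link in the recipe.
    let base = Url::parse("https://www.rust-lang.org/en-US/")?;
    let parser = Url::options().base_url(Some(&base));
    assert_eq!(
        parser.parse("learn")?.as_str(),
        "https://www.rust-lang.org/en-US/learn"
    );

    Ok(())
}
```

Links that still fail to parse with the base URL applied are silently dropped by the filter_map(|link| base_parser.parse(link).ok()) step, so only well-formed URLs reach the spawned tasks.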
Extract all unique links from MediaWiki markup
Pull the source of a MediaWiki page using reqwest::get and then look for all entries of internal and external links with Regex::captures_iter. Using Cow avoids excessive String allocations.

MediaWiki link syntax is described on MediaWiki's Help:Links page. The calling function retains the whole document, and the links are returned as slice references into the original document.
```rust
mod wiki {
    use regex::Regex;
    use std::borrow::Cow;
    use std::collections::HashSet;
    use std::sync::LazyLock;

    pub fn extract_links(content: &str) -> HashSet<Cow<str>> {
        static WIKI_REGEX: LazyLock<Regex> = LazyLock::new(|| {
            Regex::new(
                r"(?x)
                \[\[(?P<internal>[^\[\]|]*)[^\[\]]*\]\]     # internal links
                |
                (url=|URL\||\[)(?P<external>http.*?)[ \|}]  # external links
                ",
            )
            .unwrap()
        });

        let links: HashSet<_> = WIKI_REGEX
            .captures_iter(content)
            .map(|c| match (c.name("internal"), c.name("external")) {
                (Some(val), None) => Cow::from(val.as_str()),
                (None, Some(val)) => Cow::from(val.as_str()),
                _ => unreachable!(),
            })
            .collect();

        links
    }
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let content = reqwest::get(
        "https://en.wikipedia.org/w/index.php?title=Rust_(programming_language)&action=raw",
    )
    .await?
    .text()
    .await?;

    println!("{:#?}", wiki::extract_links(content.as_str()));

    Ok(())
}
```
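The named capture groups are what let a single pass over the markup distinguish the two link kinds. Below is a minimal sketch with a hand-written markup snippet and a simplified pattern (not the recipe's full regex), showing how captures_iter exposes the internal and external groups.

```rust
use regex::Regex;

fn main() {
    // Hand-written markup with one internal and one external link.
    let markup =
        "See [[Rust (programming language)|Rust]] and [https://www.rust-lang.org the site].";

    // Simplified pattern: [[target|label]] for internal links, a bracketed
    // http URL for external links.
    let re = Regex::new(
        r"\[\[(?P<internal>[^\[\]|]*)[^\[\]]*\]\]|\[(?P<external>http[^ \]]*)",
    )
    .unwrap();

    for caps in re.captures_iter(markup) {
        if let Some(m) = caps.name("internal") {
            println!("internal: {}", m.as_str()); // Rust (programming language)
        }
        if let Some(m) = caps.name("external") {
            println!("external: {}", m.as_str()); // https://www.rust-lang.org
        }
    }
}
```

Because each match borrows from the input string, wrapping it in Cow::from yields Cow::Borrowed values, which is why the recipe can return a HashSet<Cow<str>> without copying the link text.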