https://github.com/cran/XML
Raw File
Tip revision: 2dea04a1ef06f53429143cf07f247203a6e73f8b authored by CRAN Team on 29 November 2023, 11:55:26 UTC
version 3.99-0.16
Tip revision: 2dea04a
getHTMLLinks.Rd
\name{getHTMLLinks}
\alias{getHTMLLinks}
\alias{getHTMLExternalFiles}
\title{Get links or names of external files in HTML document} 
\description{
  These functions allow us to retrieve either the links
  within an HTML document, or the collection of names of
  external files referenced in an HTML document.
  The external files include images, JavaScript and CSS documents.
}
\usage{
getHTMLLinks(doc, externalOnly = TRUE, xpQuery = "//a/@href",
               baseURL = docName(doc), relative = FALSE)
getHTMLExternalFiles(doc, xpQuery = c("//img/@src", "//link/@href",
                                      "//script/@href", "//embed/@src"),
                     baseURL = docName(doc), relative = FALSE,
                     asNodes = FALSE, recursive = FALSE)
}
%- maybe also 'usage' for other objects documented here.
\arguments{
  \item{doc}{the HTML document as a URL, local file name, parsed
    document or an XML/HTML node}
  \item{externalOnly}{a logical value that indicates whether we should
    only return links to external documents and not references to
    internal anchors/nodes within this document, i.e. those of the
    form \code{#foo}.}
  \item{xpQuery}{a character vector of XPath expressions which match the nodes of interest}
\item{baseURL}{the URL of the container document. This is used
    to resolve relative references/links.
}
\item{relative}{a logical value indicating whether to leave the
  references as relative to the base URL or to expand them to their full paths.
}
\item{asNodes}{a logical value that indicates whether we want the actual
  HTML/XML nodes in the document that reference external documents
  or just the names of the external documents.}
\item{recursive}{a logical value that controls whether we recursively
  process the external documents we find in the top-level document,
  examining them for their own external files.}
}

\value{
  \code{getHTMLLinks} returns a character vector of the links.

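  \code{getHTMLExternalFiles} returns a character vector of the names of
  the external files or, if \code{asNodes} is \code{TRUE}, the nodes in
  the document that reference them.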
}
\author{
Duncan Temple Lang
}
\seealso{
  \code{\link{getXIncludes}}
}
\examples{\donttest{ # site is flaky
  try(getHTMLLinks("https://www.omegahat.net"))

  try(getHTMLLinks("https://www.omegahat.net/RSXML"))

  try(unique(getHTMLExternalFiles("https://www.omegahat.net")))
}}
\keyword{IO}
\keyword{programming}

back to top