Skip to content

Commit 87b0173

Browse files
committed
Import code, add CLI
1 parent eb60f36 commit 87b0173

File tree

12 files changed

+7056
-0
lines changed

12 files changed

+7056
-0
lines changed

Cargo.lock

Lines changed: 3071 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
[workspace]
2+
resolver = "2"
3+
members = [ "html2rdf" , "html2rdf-cli"]
4+
5+
[workspace.package]
6+
authors = ["George Pollard <porges@porg.es>"]
7+
edition = "2024"
8+
keywords = ["html", "rdf", "rdfa"]
9+
license = "Apache-2.0"
10+
publish = false
11+
readme = "README.md"
12+
repository = "https://github.com/Porges/html2rdf"

deny.toml

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
[advisories]
2+
ignore = [ ]
3+
4+
[licenses]
5+
allow = [
6+
"Apache-2.0",
7+
"BSD-3-Clause",
8+
"ISC",
9+
"MIT",
10+
"MPL-2.0",
11+
"Unicode-3.0",
12+
]
13+
14+
[licenses.private]
15+
ignore = true
16+
17+
[bans]
18+
multiple-versions = "warn"

html2rdf-cli/Cargo.toml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
[package]
2+
name = "html2rdf-cli"
3+
version = "0.1.0"
4+
authors.workspace = true
5+
edition.workspace = true
6+
keywords.workspace = true
7+
license.workspace = true
8+
publish.workspace = true
9+
readme.workspace = true
10+
repository.workspace = true
11+
12+
[[bin]]
13+
name = "html2rdf"
14+
path = "src/main.rs"
15+
16+
[dependencies]
17+
clap = { version = "4.5.38", features = ["derive", "wrap_help"] }
18+
html2rdf = { path = "../html2rdf" }
19+
oxiri = "0.2.11"
20+
oxrdf = "0.2.4"
21+
oxttl = "0.1.8"
22+
reqwest = { version = "0.12.15", features = ["blocking"] }
23+
url = "2.5.4"

html2rdf-cli/src/main.rs

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
use std::process::ExitCode;
2+
3+
use clap::Parser;
4+
5+
#[derive(Parser)]
6+
#[command(version, about)]
7+
struct Args {
8+
#[arg(value_name = "URL")]
9+
target: url::Url,
10+
}
11+
12+
fn main() -> Result<ExitCode, Box<dyn std::error::Error>> {
13+
let args = Args::parse();
14+
let client = reqwest::blocking::Client::new();
15+
let base = args.target.to_string();
16+
let base_iri = oxiri::Iri::parse(base.clone())?;
17+
let response = client.get(args.target).send()?.error_for_status()?;
18+
let content_type = response
19+
.headers()
20+
.get(reqwest::header::CONTENT_TYPE)
21+
.and_then(|v| v.to_str().ok());
22+
23+
if content_type.is_some_and(|ct| !ct.starts_with("text/html")) {
24+
eprintln!("Error: content type is not text/html.");
25+
return Ok(ExitCode::FAILURE);
26+
}
27+
28+
let content = response.text()?;
29+
let mut output_graph = oxrdf::Graph::new();
30+
let mut processor_graph = oxrdf::Graph::new();
31+
html2rdf::process(
32+
&content,
33+
base_iri.clone(),
34+
&mut output_graph,
35+
&mut processor_graph,
36+
)?;
37+
38+
{
39+
// output any warnings/errors
40+
let serializer = oxttl::TurtleSerializer::new();
41+
let mut locked_err = std::io::stderr().lock();
42+
let mut writer = serializer.for_writer(&mut locked_err);
43+
for triple in processor_graph.iter() {
44+
writer.serialize_triple(triple)?;
45+
}
46+
47+
writer.finish()?;
48+
drop(processor_graph);
49+
}
50+
51+
{
52+
// use serializer with all known prefixes
53+
let serializer = html2rdf::initial_context_prefixes().mappings().try_fold(
54+
oxttl::TurtleSerializer::new().with_base_iri(base)?,
55+
|serializer, (prefix, value)| serializer.with_prefix(prefix, value),
56+
)?;
57+
58+
let mut locked_out = std::io::stdout().lock();
59+
let mut writer = serializer.for_writer(&mut locked_out);
60+
for triple in output_graph.iter() {
61+
writer.serialize_triple(triple)?;
62+
}
63+
64+
writer.finish()?;
65+
drop(output_graph);
66+
}
67+
68+
Ok(ExitCode::SUCCESS)
69+
}

html2rdf/Cargo.toml

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
[package]
2+
name = "html2rdf"
3+
version = "0.1.0"
4+
authors.workspace = true
5+
edition.workspace = true
6+
keywords.workspace = true
7+
license.workspace = true
8+
publish.workspace = true
9+
readme.workspace = true
10+
repository.workspace = true
11+
12+
[dependencies]
13+
curie = "0.1.4"
14+
derive_more = { version = "2.0.1", features = ["display", "error", "from", "from_str"] }
15+
icu = { version = "2.0.0-beta2" }
16+
indexmap = "2.9.0"
17+
itertools = "0.14.0"
18+
oxiri = "0.2.11"
19+
oxrdf = "0.2.4"
20+
oxsdatatypes = "0.2.2"
21+
rdf-canon = "0.15.1"
22+
rxml_validation = "0.11.0"
23+
scraper = "0.23.1"
24+
vec1 = "1.12.1"
25+
26+
[dev-dependencies]
27+
insta = { version = "1.43.1", features = ["glob"] }
28+
oxttl = "0.1.8"
29+
pretty_assertions = "1.4.1"
30+
rstest = { version = "0.25.0", default-features = false, features = ["crate-name"] }
31+
sha2 = "0.10.9"

0 commit comments

Comments
 (0)