Crates.io | extractous |
lib.rs | extractous |
version | 0.1.7 |
source | src |
created_at | 2024-09-11 14:06:34.05462 |
updated_at | 2024-11-04 20:06:09.465865 |
description | Extractous provides a fast and efficient way to extract content from all kinds of file formats, including PDF, Word, Excel, CSV, Email, etc. Internally it uses a natively compiled Apache Tika for formats that are not supported natively by the Rust core |
homepage | https://extractous.yobix.ai |
repository | https://github.com/yobix-ai/extractous |
max_upload_size | |
id | 1371976 |
size | 404,382 |
Extractous is a Rust crate that provides a unified approach for detecting and extracting metadata and text content from various document types such as PDF, Word, HTML, and many other formats.
Extractor
instance
use extractous::Extractor;
use extractous::PdfParserConfig;
fn main() {
    // Build the extractor. The setters follow the consuming builder pattern:
    // each call takes the extractor by value and hands back the configured one.
    let base = Extractor::new().set_extract_string_max_length(1000);

    // Conditional configuration: only override the PDF settings when asked to
    let custom_pdf_config = true;
    let extractor = if custom_pdf_config {
        let pdf_config = PdfParserConfig::new().set_extract_annotation_text(false);
        base.set_pdf_config(pdf_config)
    } else {
        base
    };
}
String
use extractous::Extractor;
/// Prints the extracted text content of the file named by the first
/// command-line argument.
///
/// Exits with status 2 when no path is supplied and with status 1 when the
/// extraction fails, instead of panicking.
fn main() {
    // Take the first real argument; position 0 is the program name
    let file_path = match std::env::args().nth(1) {
        Some(path) => path,
        None => {
            eprintln!("usage: <program> <file_path>");
            std::process::exit(2);
        }
    };

    // Extract the provided file content to a string
    let extractor = Extractor::new();
    match extractor.extract_file_to_string(&file_path) {
        Ok(content) => println!("{}", content),
        Err(err) => {
            eprintln!("failed to extract {}: {:?}", file_path, err);
            std::process::exit(1);
        }
    }
}
StreamReader
and perform buffered reading
use std::io::{BufReader, Read};
use extractous::Extractor;
/// Streams the extracted content of the file named by the first command-line
/// argument through a `BufReader` and prints it.
///
/// Exits with status 2 when no path is supplied and with status 1 when the
/// extraction or the read fails, instead of panicking.
fn main() {
    // Take the first real argument; position 0 is the program name
    let file_path = match std::env::args().nth(1) {
        Some(path) => path,
        None => {
            eprintln!("usage: <program> <file_path>");
            std::process::exit(2);
        }
    };

    // Open the provided file as a stream of extracted content
    let extractor = Extractor::new();
    let stream = match extractor.extract_file(&file_path) {
        Ok(stream) => stream,
        Err(err) => {
            eprintln!("failed to extract {}: {:?}", file_path, err);
            std::process::exit(1);
        }
    };

    // Because stream implements std::io::Read trait we can perform buffered reading
    // For example we can use it to create a BufReader
    let mut reader = BufReader::new(stream);

    // read_to_string validates UTF-8 while reading, so no separate
    // Vec<u8> -> String conversion step is needed
    let mut content = String::new();
    if let Err(err) = reader.read_to_string(&mut content) {
        eprintln!("failed to read extracted content of {}: {}", file_path, err);
        std::process::exit(1);
    }
    println!("{}", content);
}
sudo apt install tesseract-ocr tesseract-ocr-deu
Parse error occurred : Unable to extract PDF content
, it is most likely that the OCR language pack is not installed.
use extractous::Extractor;
fn main() {
let file_path = "../test_files/documents/deu-ocr.pdf";
let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
.set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
// extract file with extractor
let content = extractor.extract_file_to_string(file_path).unwrap();
println!("{}", content);
}
sdk install java 22.0.1-graalce
GRAALVM_HOME=$HOME/.sdkman/candidates/java/22.0.1-graalce
to /etc/environment
java -version
. You should see something like:
openjdk 22.0.1 2024-04-16
OpenJDK Runtime Environment Liberica-NIK-24.0.1-1 (build 22.0.1+10)
OpenJDK 64-Bit Server VM Liberica-NIK-24.0.1-1 (build 22.0.1+10, mixed mode, sharing)
sdk install java 24.0.1.r22-nik
sudo apt install tesseract-ocr
sudo apt install tesseract-ocr-deu tesseract-ocr-ara
cargo build
cargo test