| Crates.io | extractous |
| lib.rs | extractous |
| version | 0.3.0 |
| created_at | 2024-09-11 14:06:34.05462+00 |
| updated_at | 2024-12-21 09:19:26.76509+00 |
| description | Extractous provides a fast and efficient way to extract content from all kinds of file formats, including PDF, Word, Excel, CSV, Email, etc. Internally it uses a natively compiled Apache Tika for formats that are not supported natively by the Rust core |
| homepage | https://extractous.yobix.ai |
| repository | https://github.com/yobix-ai/extractous |
| max_upload_size | |
| id | 1371976 |
| size | 806,356 |
Extractous is a Rust crate that provides a unified approach for detecting and extracting metadata and text content from various document types such as PDF, Word, HTML, and many other formats.
Creating and configuring an Extractor instance:
use extractous::Extractor;
use extractous::PdfParserConfig;
fn main() {
    // Start from a fresh extractor. The setters follow the consuming builder
    // pattern: each call takes the extractor by value and hands it back.
    let base = Extractor::new().set_extract_string_max_length(1000);

    // Configuration can also be applied conditionally.
    let custom_pdf_config = true;
    let extractor = if custom_pdf_config {
        let pdf_config = PdfParserConfig::new().set_extract_annotation_text(false);
        base.set_pdf_config(pdf_config)
    } else {
        base
    };
}
Extracting the content of a file to a String:
use extractous::Extractor;
/// Extracts the text content and metadata of the file named by the first
/// command-line argument and prints both to stdout.
///
/// Exits with status 2 when no file path is supplied and with status 1 when
/// extraction fails, instead of panicking.
fn main() {
    // The file to extract is the first command-line argument. Indexing
    // `args[1]` directly would panic when the program is run without one.
    let file_path = match std::env::args().nth(1) {
        Some(path) => path,
        None => {
            eprintln!("usage: <program> <file_path>");
            std::process::exit(2);
        }
    };

    // Create an extractor with the default configuration.
    let extractor = Extractor::new();
    // If you need XML output, rebind the extractor (consuming builder):
    // NOTE(review): the original snippet passed `false` here; confirm the
    // flag's meaning against the `set_xml_output` documentation.
    // let extractor = extractor.set_xml_output(true);

    // Extract the text content and the metadata of the file.
    let (content, metadata) = extractor
        .extract_file_to_string(&file_path)
        .unwrap_or_else(|err| {
            eprintln!("failed to extract {}: {:?}", file_path, err);
            std::process::exit(1)
        });

    println!("{}", content);
    println!("{:?}", metadata);
}
Extracting the content of a file to a StreamReader and performing buffered reading:
use std::io::{BufReader, Read};
// use std::fs::File; use for bytes
use extractous::Extractor;
/// Extracts the file named by the first command-line argument as a stream,
/// reads the stream through a `BufReader`, and prints the decoded text and
/// the metadata to stdout.
///
/// Exits with status 2 when no file path is supplied and with status 1 when
/// extraction or reading fails, instead of panicking.
fn main() {
    // The file to extract is the first command-line argument. Indexing
    // `args[1]` directly would panic when the program is run without one.
    let file_path = match std::env::args().nth(1) {
        Some(path) => path,
        None => {
            eprintln!("usage: <program> <file_path>");
            std::process::exit(2);
        }
    };

    // Extract the provided file content as a stream (not as a string).
    let extractor = Extractor::new();
    let (stream, metadata) = extractor.extract_file(&file_path).unwrap_or_else(|err| {
        eprintln!("failed to extract {}: {:?}", file_path, err);
        std::process::exit(1)
    });

    // Alternative: extract from a URL.
    // let (stream, metadata) = extractor.extract_url("https://www.google.com/").unwrap();

    // Alternative: extract from bytes held in memory (needs `use std::fs::File;`).
    // `main` returns `()`, so `?` cannot be used here.
    // let mut file = File::open(&file_path).unwrap();
    // let mut file_bytes = Vec::new();
    // file.read_to_end(&mut file_bytes).unwrap();
    // let (stream, metadata) = extractor.extract_bytes(&file_bytes).unwrap();

    // Because the stream implements the std::io::Read trait we can perform
    // buffered reading, for example by wrapping it in a BufReader.
    let mut reader = BufReader::new(stream);
    let mut content = String::new();
    // `read_to_string` reports both I/O failures and invalid UTF-8 as an
    // error value, so a single check replaces two panicking unwraps.
    if let Err(err) = reader.read_to_string(&mut content) {
        eprintln!("failed to read extracted content of {}: {}", file_path, err);
        std::process::exit(1);
    }

    println!("{}", content);
    println!("{:?}", metadata);
}
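Extracting with OCR requires Tesseract and the matching language pack, for example:
sudo apt install tesseract-ocr tesseract-ocr-deu
If the language pack is missing you will get an error like:
Parse error occurred : Unable to extract PDF content, it is most likely that OCR language pack is not installed
use extractous::Extractor;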
fn main() {
let file_path = "../test_files/documents/deu-ocr.pdf";
let extractor = Extractor::new()
.set_ocr_config(TesseractOcrConfig::new().set_language("deu"))
.set_pdf_config(PdfParserConfig::new().set_ocr_strategy(PdfOcrStrategy::OCR_ONLY));
// extract file with extractor
let (content, metadata) = extractor.extract_file_to_string(file_path).unwrap();
println!("{}", content);
println!("{:?}", metadata);
}
sdk install java 23.0.1-graalce
Add GRAALVM_HOME=$HOME/.sdkman/candidates/java/23.0.1-graalce to /etc/environment
Run java -version. You should see something like:
openjdk 23.0.1 2024-10-15
OpenJDK Runtime Environment GraalVM CE 23.0.1+11.1 (build 23.0.1+11-jvmci-b01)
OpenJDK 64-Bit Server VM GraalVM CE 23.0.1+11.1 (build 23.0.1+11-jvmci-b01, mixed mode, sharing)
sdk install java 24.1.1.r23-nik
sudo apt install tesseract-ocr
sudo apt install tesseract-ocr-deu tesseract-ocr-ara
brew install tesseract tesseract-lang
cargo build
cargo test