diff --git a/img/diagram-example.png b/img/diagram-example.png new file mode 100644 index 0000000..c53e07e Binary files /dev/null and b/img/diagram-example.png differ diff --git a/output.txt b/output.txt deleted file mode 100644 index f1ba712..0000000 --- a/output.txt +++ /dev/null @@ -1,332 +0,0 @@ -Found 166 textline image components. -BorrowedBox(0x5649038d9ed0), confidence: 34, text: ldeas | talt.tech - -BorrowedBox(0x5649038da200), confidence: 10, text: . - -BorrowedBox(0x5649038da0e0), confidence: 74, text: Home | tait.tech - -BorrowedBox(0x5649038da140), confidence: 10, text: . - -BorrowedBox(0x5649038da750), confidence: 73, text: O lait Hoyem - Dashboard - -BorrowedBox(0x5649038d9fd0), confidence: 10, text: . - -BorrowedBox(0x5649038d9e50), confidence: 84, text: () Problem loading page - -BorrowedBox(0x5649038d9e70), confidence: 10, text: . - -BorrowedBox(0x5649038d9e90), confidence: 81, text: () leptess/low_level_ocr_v - -BorrowedBox(0x5649038da040), confidence: 10, text: . - -BorrowedBox(0x5649038da060), confidence: 73, text: &) Transcript of the Proclar - -BorrowedBox(0x5649038da080), confidence: 13, text: . - -BorrowedBox(0x564901555e80), confidence: 46, text: -+ - -BorrowedBox(0x564901555ea0), confidence: 22, text: b4 - -BorrowedBox(0x564901555ec0), confidence: 51, text: Whereas. - -BorrowedBox(0x5649038d0d10), confidence: 23, text: Ol - -BorrowedBox(0x5649038d0d30), confidence: 83, text: the - -BorrowedBox(0x5649038d0d50), confidence: 44, text: twenty-secona - -BorrowedBox(0x5649038d9bd0), confidence: 53, text: aay - -BorrowedBox(0x5649038d9bf0), confidence: 82, text: Of - -BorrowedBox(0x5649038d9c10), confidence: 88, text: September, - -BorrowedBox(0x5649038d9c30), confidence: 78, text: N - -BorrowedBox(0x5649038d9cb0), confidence: 89, text: the - -BorrowedBox(0x5649038d9cd0), confidence: 74, text: yedr - -BorrowedBox(0x5649038d9cf0), confidence: 82, text: Of - -BorrowedBox(0x5649038d9d10), confidence: 0, text: -BorrowedBox(0x56490155a6a0), confidence: 57, text: Our - -BorrowedBox(0x56490155a6c0), confidence: 65, text: L Oro - -BorrowedBox(0x56490155a6e0), confidence: 24, text: Orie - -BorrowedBox(0x56490155a700), confidence: 5, text: thousanao - -BorrowedBox(0x56490155a720), confidence: 50, text: elgnt - -BorrowedBox(0x56490395a180), confidence: 44, text: Nnunareo - -BorrowedBox(0x56490395a1a0), confidence: 54, text: =1ale - -BorrowedBox(0x56490395a1c0), confidence: 38, text: SIXty-two, - -BorrowedBox(0x56490395a1e0), confidence: 82, text: d - -BorrowedBox(0x56490395a200), confidence: 89, text: proclamation - -BorrowedBox(0x5649038d9d90), confidence: 62, text: Was - -BorrowedBox(0x5649038d9db0), confidence: 0, text: ISSueq - -BorrowedBox(0x5649038d9dd0), confidence: 68, text: Dy - -BorrowedBox(0x5649038d9df0), confidence: 89, text: the - -BorrowedBox(0x5649038d9e10), confidence: 14, text: Presiaent - -BorrowedBox(0x5649038d1d70), confidence: 82, text: Of - -BorrowedBox(0x5649038d1d90), confidence: 83, text: the - -BorrowedBox(0x5649038d1db0), confidence: 41, text: Lniteao - -BorrowedBox(0x5649038d1dd0), confidence: 80, text: States. - -BorrowedBox(0x5649038d1df0), confidence: 89, text: containing, - -BorrowedBox(0x5649038d1e10), confidence: 37, text: dlTOng - -BorrowedBox(0x5649038d1e30), confidence: 86, text: other - -BorrowedBox(0x564903955b30), confidence: 90, text: things, - -BorrowedBox(0x564903955b50), confidence: 89, text: the - -BorrowedBox(0x564903955b70), confidence: 85, text: following, - -BorrowedBox(0x564903955b90), confidence: 61, text: 10 - -BorrowedBox(0x564903955bb0), confidence: 33, text: WITt: - -BorrowedBox(0x564903955bd0), confidence: 68, text: "1 hat - -BorrowedBox(0x564903955bf0), confidence: 64, text: Ol - -BorrowedBox(0x564903955c10), confidence: 87, text: the - -BorrowedBox(0x56490284df10), confidence: 63, text: first - -BorrowedBox(0x56490284df30), confidence: 43, text: aay - -BorrowedBox(0x56490284df50), confidence: 89, text: Of - -BorrowedBox(0x56490284df70), confidence: 90, text: January, - -BorrowedBox(0x56490284df90), confidence: 65, text: N - -BorrowedBox(0x56490284dfb0), confidence: 88, text: the - -BorrowedBox(0x56490284dfd0), confidence: 59, text: yedar - -BorrowedBox(0x56490284dff0), confidence: 82, text: Of - -BorrowedBox(0x564903961e20), confidence: 86, text: Our - -BorrowedBox(0x564903961e40), confidence: 44, text: | Orao - -BorrowedBox(0x564903961e60), confidence: 66, text: Ore - -BorrowedBox(0x564903961e80), confidence: 22, text: thousana - -BorrowedBox(0x564903961ea0), confidence: 49, text: elgnht - -BorrowedBox(0x564903961ec0), confidence: 33, text: Nnunareo - -BorrowedBox(0x564903961ee0), confidence: 58, text: ana - -BorrowedBox(0x564903961f00), confidence: 34, text: Sixty-three, - -BorrowedBox(0x564903961f20), confidence: 49, text: || - -BorrowedBox(0x56490394b740), confidence: 29, text: PEIrSOns - -BorrowedBox(0x56490394b760), confidence: 56, text: nelo - -BorrowedBox(0x56490394b780), confidence: 63, text: as - -BorrowedBox(0x56490394b7a0), confidence: 72, text: glaves - -BorrowedBox(0x56490394b7c0), confidence: 41, text: within - -BorrowedBox(0x56490394b7e0), confidence: 40, text: dally - -BorrowedBox(0x56490394b800), confidence: 88, text: State - -BorrowedBox(0x56490394b820), confidence: 62, text: OFr - -BorrowedBox(0x56490394b840), confidence: 36, text: designatead - -BorrowedBox(0x5649027f6300), confidence: 90, text: part - -BorrowedBox(0x5649027f6320), confidence: 89, text: Of - -BorrowedBox(0x5649027f6340), confidence: 67, text: A - -BorrowedBox(0x5649027f6360), confidence: 88, text: State, - -BorrowedBox(0x5649027f6380), confidence: 84, text: the - -BorrowedBox(0x5649027f63a0), confidence: 89, text: people - -BorrowedBox(0x5649027f63c0), confidence: 88, text: whereof - -BorrowedBox(0x5649027f63e0), confidence: 59, text: snall - -BorrowedBox(0x5649027f6400), confidence: 89, text: then - -BorrowedBox(0x5649027f6420), confidence: 55, text: De - -BorrowedBox(0x5649027f4d00), confidence: 78, text: N - -BorrowedBox(0x5649027f4d20), confidence: 87, text: rebellion - -BorrowedBox(0x5649027f4d40), confidence: 82, text: against - -BorrowedBox(0x5649027f4d60), confidence: 89, text: the - -BorrowedBox(0x5649027f4d80), confidence: 40, text: Lniteao - -BorrowedBox(0x5649027f4da0), confidence: 40, text: States. - -BorrowedBox(0x5649027f4dc0), confidence: 76, text: snall - -BorrowedBox(0x5649027f4de0), confidence: 2, text: De - -BorrowedBox(0x5649027f4e00), confidence: 87, text: then, - -BorrowedBox(0x5649027f4e20), confidence: 37, text: thenceforwaraq. - -BorrowedBox(0x5649038da7c0), confidence: 0, text: N0 - -BorrowedBox(0x5649038da7e0), confidence: 87, text: forever - -BorrowedBox(0x5649038da800), confidence: 61, text: free: - -BorrowedBox(0x5649038da820), confidence: 54, text: =1ale - -BorrowedBox(0x5649038da840), confidence: 83, text: the - -BorrowedBox(0x5649038da860), confidence: 10, text: = xecutive - -BorrowedBox(0x5649038da880), confidence: 85, text: (sovernment - -BorrowedBox(0x5649038da8a0), confidence: 87, text: Of - -BorrowedBox(0x5649038da8c0), confidence: 88, text: the - -BorrowedBox(0x5649038da8e0), confidence: 33, text: Lniteao - -BorrowedBox(0x5649027f6180), confidence: 90, text: States, - -BorrowedBox(0x5649027f61a0), confidence: 0, text: including - -BorrowedBox(0x5649027f61c0), confidence: 89, text: the - -BorrowedBox(0x5649027f61e0), confidence: 88, text: military - -BorrowedBox(0x5649027f6200), confidence: 58, text: ana - -BorrowedBox(0x5649027f6220), confidence: 30, text: Nnaval - -BorrowedBox(0x5649027f6240), confidence: 73, text: authority - -BorrowedBox(0x5649027f6260), confidence: 57, text: thereof, - -BorrowedBox(0x5649027f6280), confidence: 25, text: will - -BorrowedBox(0x5649027f62a0), confidence: 90, text: recognize - -BorrowedBox(0x5649038cef00), confidence: 54, text: =1ale - -BorrowedBox(0x5649038cef20), confidence: 58, text: maintain - -BorrowedBox(0x5649038cef40), confidence: 79, text: the - -BorrowedBox(0x5649038cef60), confidence: 80, text: freedom - -BorrowedBox(0x5649038cef80), confidence: 89, text: Of - -BorrowedBox(0x5649038cefa0), confidence: 36, text: sSuch - -BorrowedBox(0x5649038cefc0), confidence: 5, text: PEISOnS, - -BorrowedBox(0x5649038cefe0), confidence: 54, text: =1ale - -BorrowedBox(0x5649038cf000), confidence: 28, text: Wil - -BorrowedBox(0x5649038cf020), confidence: 79, text: 00 - -BorrowedBox(0x5649038cf040), confidence: 77, text: 10 - -BorrowedBox(0x5649015589d0), confidence: 20, text: aACT - -BorrowedBox(0x5649015589f0), confidence: 58, text: OFr - -BorrowedBox(0x564901558a10), confidence: 38, text: aC1S - -BorrowedBox(0x564901558a30), confidence: 81, text: 10 - -BorrowedBox(0x564901558a50), confidence: 27, text: [EPIresSS - -BorrowedBox(0x564901558a70), confidence: 36, text: sSuch - -BorrowedBox(0x564901558a90), confidence: 0, text: PEISOnS, - -BorrowedBox(0x564901558ab0), confidence: 75, text: OFr - -BorrowedBox(0x564901558ad0), confidence: 85, text: ally - -BorrowedBox(0x564901558af0), confidence: 87, text: Of - -BorrowedBox(0x564901558b10), confidence: 45, text: them., - -BorrowedBox(0x564903961530), confidence: 78, text: N - -BorrowedBox(0x564903961550), confidence: 74, text: dlly - -BorrowedBox(0x564903961570), confidence: 75, text: efforts - -BorrowedBox(0x564903961590), confidence: 90, text: they - -BorrowedBox(0x5649039615b0), confidence: 41, text: [T1dy - -BorrowedBox(0x5649039615d0), confidence: 86, text: make - -BorrowedBox(0x5649039615f0), confidence: 91, text: for - -BorrowedBox(0x564903961610), confidence: 69, text: thelr - -BorrowedBox(0x564903961630), confidence: 86, text: actual - -BorrowedBox(0x564903961650), confidence: 18, text: freeadom. - -BorrowedBox(0x564903961670), confidence: 60, text: T hat - -BorrowedBox(0x56490390b0c0), confidence: 20, text: tho - -BorrowedBox(0x56490390b0e0), confidence: 10, text: —vocritive - -BorrowedBox(0x56490390b100), confidence: 53, text: will - -BorrowedBox(0x56490390b120), confidence: 42, text: (YY) - -BorrowedBox(0x56490390b140), confidence: 16, text: tho - -BorrowedBox(0x56490390b160), confidence: 57, text: firet - -BorrowedBox(0x56490390b180), confidence: 21, text: N \/ - -BorrowedBox(0x56490390b1a0), confidence: 69, text: N1 - -BorrowedBox(0x56490390b1c0), confidence: 15, text: 1aniiarv - -BorrowedBox(0x56490390b1e0), confidence: 51, text: atoracain - -BorrowedBox(0x56490390b200), confidence: 64, text: N\/ - diff --git a/src/main.rs b/src/main.rs index 79dac02..41898e1 100644 --- a/src/main.rs +++ b/src/main.rs @@ -1,12 +1,22 @@ extern crate leptess; +use ocr_json_common::TextBox; use leptess::{leptonica, tesseract}; +use std::env; use std::path::Path; -fn main() { - let mut api = tesseract::TessApi::new(None, "eng").unwrap(); +/* TODO: preprox here */ - let pix = leptonica::pix_read(Path::new("./test.png")).unwrap(); +fn main() { + let mut ocr_rects = Vec::new(); + let file_name = if env::args().count() == 2 { + env::args().nth(1).unwrap() + } else { + panic!("Please enter a target file path") + }; + let image_path = Path::new(&file_name); + let mut api = tesseract::TessApi::new(Some("/usr/share/tessdata/"), "eng").unwrap(); + let pix = leptonica::pix_read(image_path).unwrap(); api.set_image(&pix); // detect bounding boxes for words @@ -14,13 +24,30 @@ fn main() { .get_component_images(leptess::capi::TessPageIteratorLevel_RIL_WORD, true) .unwrap(); - println!("Found {} textline image components.", boxes.get_n()); - + let mut boxid = 0; // run OCR on each word bounding box for b in &boxes { api.set_rectangle(&b); let text = api.get_utf8_text().unwrap(); let confi = api.mean_text_conf(); - println!("{:?}, confidence: {}, text: {}", b, confi, text); + let bref = b.as_ref(); + /* + println!( + "[X: {}, Y: {}, W: {}, H: {}]: confidence: {}, text: {}", + bref.x, bref.y, bref.w, bref.h, confi, text + );*/ + ocr_rects.push(TextBox { + id: boxid, + hint: text, + confidence: confi as u32, + x: bref.x, + y: bref.y, + height: bref.h as u32, + width: bref.w as u32, + }); + boxid += 1; } + + let json = serde_json::to_string(&ocr_rects).unwrap(); + println!("{}", json); } diff --git a/test.png b/test.png deleted file mode 100644 index b7b8fc9..0000000 Binary files a/test.png and /dev/null differ