Skip to content

Commit

Permalink
Merge branch 'rl-0.14.0'
Browse files Browse the repository at this point in the history
  • Loading branch information
amenezes committed Feb 5, 2024
2 parents 3a67ea6 + b272632 commit 4fdd4e3
Show file tree
Hide file tree
Showing 9 changed files with 179 additions and 38 deletions.
2 changes: 1 addition & 1 deletion .tool-versions
Original file line number Diff line number Diff line change
@@ -1 +1 @@
python 3.12.0
python 3.12.1
126 changes: 109 additions & 17 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,67 +18,123 @@ pip install aiopytesseract

## Usage

```python
from pathlib import Path
### List all available languages by Tesseract installation

``` python
import aiopytesseract


# list all available languages by tesseract installation
await aiopytesseract.languages()
await aiopytesseract.get_languages()
```

### Tesseract version

``` python
import aiopytesseract

# tesseract version
await aiopytesseract.tesseract_version()
await aiopytesseract.get_tesseract_version()
```

### Tesseract parameters

``` python
import aiopytesseract

# tesseract parameters
await aiopytesseract.tesseract_parameters()
```

### Confidence only info

``` python
import aiopytesseract

# confidence only info
await aiopytesseract.confidence("tests/samples/file-sample_150kB.png")
```

### Deskew info

``` python
import aiopytesseract

# deskew info
await aiopytesseract.deskew("tests/samples/file-sample_150kB.png")
```

### Extract text from an image: locally or bytes

``` python
from pathlib import Path

import aiopytesseract

# extract text from an image: locally or bytes
await aiopytesseract.image_to_string("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_string(
Path("tests/samples/file-sample_150kB.png")read_bytes(), dpi=220, lang='eng+por'
Path("tests/samples/file-sample_150kB.png").read_bytes(), dpi=220, lang='eng+por'
)
```

### Box estimates

``` python
from pathlib import Path

import aiopytesseract

# box estimates
await aiopytesseract.image_to_boxes("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_boxes(Path("tests/samples/file-sample_150kB.png"))
```

### Boxes, confidence and page numbers

``` python
from pathlib import Path

import aiopytesseract

# boxes, confidence and page numbers
await aiopytesseract.image_to_data("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_data(Path("tests/samples/file-sample_150kB.png"))
```

### Information about orientation and script detection

``` python
from pathlib import Path

import aiopytesseract

# information about orientation and script detection
await aiopytesseract.image_to_osd("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_osd(Path("tests/samples/file-sample_150kB.png"))
```

### Generate a searchable PDF

``` python
from pathlib import Path

import aiopytesseract

# generate a searchable PDF
await aiopytesseract.image_to_pdf("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_pdf(Path("tests/samples/file-sample_150kB.png"))
```

### Generate HOCR output

``` python
from pathlib import Path

import aiopytesseract

# generate HOCR output
await aiopytesseract.image_to_hocr("tests/samples/file-sample_150kB.png")
await aiopytesseract.image_to_hocr(Path("tests/samples/file-sample_150kB.png"))
```

### Multi output

``` python
from pathlib import Path

import aiopytesseract

# multi output
async with aiopytesseract.run(
Path('tests/samples/file-sample_150kB.png').read_bytes(),
'output',
Expand All @@ -89,7 +145,43 @@ async with aiopytesseract.run(
alto_file, tsv_file, txt_file = resp
```

For more details on Tesseract best practices and on aiopytesseract itself, see the `docs` folder.
### Config variables

``` python
from pathlib import Path

import aiopytesseract

async with aiopytesseract.run(
Path('tests/samples/text-with-chars-and-numbers.png').read_bytes(),
'output',
'alto tsv txt',
config=[("tessedit_char_whitelist", "0123456789")]
) as resp:
# will generate (output.xml, output.tsv and output.txt)
print(resp)
alto_file, tsv_file, txt_file = resp
```

``` python
from pathlib import Path

import aiopytesseract

await aiopytesseract.image_to_string(
"tests/samples/text-with-chars-and-numbers.png",
config=[("tessedit_char_whitelist", "0123456789")]
)

await aiopytesseract.image_to_string(
Path("tests/samples/text-with-chars-and-numbers.png").read_bytes(),
dpi=220,
lang='eng+por',
config=[("tessedit_char_whitelist", "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ")]
)
```

> For more details on Tesseract best practices and the aiopytesseract, see the folder: `docs`.

## Examples

Expand Down
2 changes: 1 addition & 1 deletion aiopytesseract/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
)
from .models import OSD, Box, Data, Parameter

__version__ = "0.13.0"
__version__ = "0.14.0"
__all__ = [
"__version__",
"OSD",
Expand Down
20 changes: 18 additions & 2 deletions aiopytesseract/base_command.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ async def execute(
user_words: Union[None, str] = None,
user_patterns: Union[None, str] = None,
tessdata_dir: Union[None, str] = None,
config: Union[None, List[Tuple[str, str]]] = None,
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
) -> bytes:
raise NotImplementedError

Expand All @@ -67,6 +69,8 @@ async def _(
user_words: Union[None, str] = None,
user_patterns: Union[None, str] = None,
tessdata_dir: Union[None, str] = None,
config: Union[None, List[Tuple[str, str]]] = None,
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
) -> bytes:
await file_exists(image)
response: bytes = await execute(
Expand All @@ -80,6 +84,8 @@ async def _(
user_words=user_words,
user_patterns=user_patterns,
tessdata_dir=tessdata_dir,
config=config,
encoding=encoding,
)
return response

Expand All @@ -89,24 +95,26 @@ async def _(
image: bytes,
output_format: str,
dpi: int,
lang: Union[None, str],
psm: int,
oem: int,
timeout: float,
lang: Union[None, str] = None,
user_words: Union[None, str] = None,
user_patterns: Union[None, str] = None,
tessdata_dir: Union[None, str] = None,
config: Union[None, List[Tuple[str, str]]] = None,
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
) -> bytes:
cmd_args = await _build_cmd_args(
output_extension=output_format,
dpi=dpi,
psm=psm,
oem=oem,
lang=lang,
user_words=user_words,
user_patterns=user_patterns,
tessdata_dir=tessdata_dir,
lang=lang,
config=config,
)
try:
proc = await asyncio.wait_for(
Expand Down Expand Up @@ -142,6 +150,7 @@ async def execute_multi_output_cmd(
user_words: Union[None, str] = None,
user_patterns: Union[None, str] = None,
tessdata_dir: Union[None, str] = None,
config: Union[None, List[Tuple[str, str]]] = None,
encoding: str = AIOPYTESSERACT_DEFAULT_ENCODING,
) -> Tuple[str, ...]:
cmd_args = await _build_cmd_args(
Expand All @@ -154,6 +163,7 @@ async def execute_multi_output_cmd(
tessdata_dir=tessdata_dir,
lang=lang,
output=output_file,
config=config,
)
try:
proc = await asyncio.wait_for(
Expand Down Expand Up @@ -187,6 +197,7 @@ async def _build_cmd_args(
tessdata_dir: Union[None, str] = None,
lang: Union[None, str] = None,
output: str = "stdout",
config: Union[None, List[Tuple[str, str]]] = None,
) -> List[str]:
await asyncio.gather(psm_is_valid(psm), oem_is_valid(oem))
# OCR options must occur before any configfile.
Expand All @@ -212,6 +223,11 @@ async def _build_cmd_args(
cmd_args.append("-l")
cmd_args.append(lang)

if config:
for option, value in config:
cmd_args.append("-c")
cmd_args.append(f"{option}={value}")

extension = reversed(output_extension.split())
for ext in extension:
cmd_args.append(ext)
Expand Down
Loading

0 comments on commit 4fdd4e3

Please sign in to comment.