machine_archivist/archive_pipeline.py

433 lines
18 KiB
Python
Raw Permalink Normal View History

2025-04-06 19:58:31 +02:00
# This script captures an image from the webcam, extracts' text using Tesseract or EasyOCR,
# generates a description using Ollama Vision, and uploads the text and image to Logseq.
# It also detects markers in the image using SIFT feature matching.
# The extracted text is checked for readability using textstat, and only a meaningful text is uploaded to Logseq.
# The original image is uploaded to Logseq as an asset.
# The annotated image is saved in the assets folder of the Logseq directory.
import locale
import urllib
import pytesseract
import easyocr
import ollama
import textstat
from logseq_api import Logseq, add_to_last_block_as_child
from ollama_ocr import OCRProcessor
from marker_detect import *
from camera_handler import scan
from image_analysis_pyobjc import recognize_text_pyobjc
import argparse
from datetime import datetime
from logger_config import logger
from pinry_handler import upload_and_create_pin
import time
import sys
from dotenv import load_dotenv
from success_message import show_image_with_message
import cv2
from PIL import Image
# Initialize OCR processors
MODEL_NAME = 'llama3.2-vision:11b'
ocr_processor = OCRProcessor(model_name=MODEL_NAME)
# Set locale to German
locale.setlocale(locale.LC_TIME, 'de_DE.UTF-8')
def get_image_description_ollama(image_path):
"""Uses an Ollama Vision model to generate a description of the image."""
try:
res = ollama.chat(
model=MODEL_NAME,
messages=[{'role': 'user', 'content': 'Describe this image.', 'images': [image_path]}]
)
return res['message']['content']
except Exception as e:
logger.error(f"❌ Ollama vision failed: {e}")
return ""
def recognize_text_easyocr(image_path):
"""Uses EasyOCR to extract text from the image."""
try:
reader = easyocr.Reader(['en', 'de'], gpu=True)
results = reader.readtext(image_path, detail=0, paragraph=True)
return ' '.join(results)
except Exception as e:
logger.error(f"❌ EasyOCR failed: {e}")
return ""
def recognize_text_tesseract(image_path):
"""Uses Tesseract OCR for text recognition."""
try:
img = Image.open(image_path)
gray = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2GRAY)
return pytesseract.image_to_string(gray, config='--oem 1 --psm 6')
except Exception as e:
logger.error(f"❌ Tesseract OCR failed: {e}")
return ""
def is_meaningful_text(text):
"""Checks if extracted text is meaningful using readability metrics."""
if not text or len(text.split()) < 3:
return False
return textstat.flesch_reading_ease(text) > 40 or textstat.text_standard(text, float_output=True) > 30
def transfer_img_to_logseq(frame):
"""Saves an image and generates a timestamped path."""
try:
now = datetime.now()
timestamp = now.strftime('%Y-%m-%dT%H-%M-%S-%f')[:-2]
asset_path = f'/Users/aron/thesis/logseq/assets/scans/{timestamp}.jpg'
cv2.imwrite(asset_path, frame)
return asset_path
except Exception as e:
logger.error(f"❌ Image saving failed: {e}")
return None
def ollama_ocr(image_path):
"""Performs OCR using Ollama's Llama Vision model."""
try:
return ocr_processor.process_image(image_path=image_path,
format_type="markdown",
preprocess=True,
)
except Exception as e:
logger.error(f"❌ Ollama OCR failed: {e}")
return ""
def append_block_to_logseq_page(title, block_title, block_content, logseq):
"""Appends a block to a Logseq page."""
try:
logseq.Editor.appendBlockInPage(title, block_title)
add_to_last_block_as_child(title, block_content, logseq)
except Exception as e:
logger.error(f"❌ Logseq block append failed: {e}")
def upload_to_logseq(logseq,
title,
ocr_text,
caption,
image_path_list,
hashtags=[],
marker_list=None,
ollama_ocr_text=None):
"""Uploads extracted text and images to Logseq."""
try:
logseq.App.showMsg('Uploading to Logseq...')
logger.info(f" {image_path_list=}")
# insert markers as tags
marker_string = ""
# add the elems of hashtags to the marker_string
for hash in hashtags:
# strip the initial # from the hashtag
hash = hash[1:]
marker_list.append(hash)
if marker_list:
# append markers as tags to marker_string
marker_string = ', '.join(marker_list)
logseq.Editor.createPage(title, {'tags': f'{marker_string}, upload',
'public': 'true',
'author': 'Automated Archive',
'source': 'Camera',
'date': f'{datetime.now().strftime("%Y-%m-%d")}',
'timestamp': f'{datetime.now().strftime("%H:%M:%S")}',
})
# Commented out because the markers are now tags
# logseq.Editor.appendBlockInPage(title, "## Detected Markers:")
# add_to_last_block_as_child(title, marker_string, logseq)
else:
logseq.Editor.createPage(title, {'tags': 'upload',
'public': 'true',
'author': 'Automated Archive',
'source': 'Camera',
'date': f'{datetime.now().strftime("%Y-%m-%d")}',
'timestamp': f'{datetime.now().strftime("%H:%M:%S")}',
})
# logseq.Editor.appendBlockInPage(title, "## Detected Markers:")
# add_to_last_block_as_child(title, "No markers detected.", logseq)
# add classical ocr text
if ocr_text:
logseq.Editor.appendBlockInPage(title, "## OCR Text:")
add_to_last_block_as_child(title, ocr_text, logseq)
# add ollama ocr text
if ollama_ocr_text:
logseq.Editor.appendBlockInPage(title, "## Ollama OCR Text:")
add_to_last_block_as_child(title, ollama_ocr_text, logseq)
# upload images
if image_path_list:
logseq.Editor.appendBlockInPage(title, "## Scans:")
# convert to relative paths
relative_path_list = []
for path in image_path_list:
parts = path.split("assets/scans/", 1) # Split at "scans/"
if len(parts) > 1:
relative_path_list.append("./assets/scans/" + parts[1]) # Add "./" at the start
for i, rel_image_path in enumerate(relative_path_list):
block_property = 'annotated:: false' if i == 0 else 'annotated:: true'
logseq.Editor.appendBlockInPage(title, f"### Scan {i + 1}\n{block_property}")
add_to_last_block_as_child(title, f"![{rel_image_path}]({rel_image_path})", logseq)
# add extracted text from ollama
if caption:
logseq.Editor.appendBlockInPage(title, "Ollama Image Caption:")
add_to_last_block_as_child(title, caption, logseq)
logseq.App.showMsg('Upload complete!')
except Exception as e:
logger.error(f"❌ Logseq upload failed: {e}")
def str_to_bool(value):
"""Convert string values from .env to boolean."""
return value.lower() in ("true", "1", "yes")
def parse_arguments():
"""Parse and return command-line arguments."""
load_dotenv() # Load environment variables
# Load environment variables with fallbacks
default_camera = int(os.getenv("CAMERA_INDEX", 0))
default_upload = str_to_bool(os.getenv("UPLOAD", "True"))
default_ocr_processor = os.getenv("OCR_PROCESSOR", "pyobjc")
default_use_ollama_caption = str_to_bool(os.getenv("USE_OLLAMA_CAPTION", "False"))
default_use_ollama_ocr = str_to_bool(os.getenv("USE_OLLAMA_OCR", "False"))
default_check_markers = str_to_bool(os.getenv("CHECK_MARKERS", "True"))
default_upload_marked_image = str_to_bool(os.getenv("UPLOAD_MARKED_IMAGE", "False"))
parser = argparse.ArgumentParser(
description="Scan an image using a specific camera index or provide an image file.")
parser.add_argument("--camera", type=int, default=default_camera,
help=f"Set the camera index (integer value required) (default: {default_camera})")
parser.add_argument("--image", type=str, help="Optional image file path to use instead of capturing a new image")
parser.add_argument("--upload", action="store_true", default=default_upload,
help=f"Enable Logseq upload (default: {default_upload})")
parser.add_argument("--ocr-processor", choices=["easyocr", "tesseract", "pyobjc"],
default=default_ocr_processor,
help=f"Choose OCR processor (default: '{default_ocr_processor}')")
parser.add_argument("--use-ollama-caption", action="store_true", default=default_use_ollama_caption,
help=f"Use Ollama for image description (default: {default_use_ollama_caption})")
parser.add_argument("--use-ollama-ocr", action="store_true", default=default_use_ollama_ocr,
help=f"Use Ollama for OCR (default: {default_use_ollama_ocr})")
parser.add_argument("--check-markers", action="store_true", default=default_check_markers,
help=f"Check for markers in the image (default: {default_check_markers})")
parser.add_argument("--upload-marked-image", action="store_true", default=default_upload_marked_image,
help=f"Upload marked image if markers detected (default: {default_upload_marked_image})")
return parser.parse_args()
def log_configuration(args):
"""Log configuration settings."""
logger.info("🔧 Configuration:")
for key, value in vars(args).items():
logger.info(f" {key.capitalize()}: {value}")
logger.info("------------------------------------------------------\n")
def capture_image(camera_index):
"""Capture an image from the camera."""
snap_timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S-%f")[:-3]
image_path = f"./scans/snap_{snap_timestamp}.png"
frame, captured_image_path = scan(image_path=image_path, camera_index=camera_index, hold_preview=False,
preview_scale=0.5, timeout=30)
if frame is None or captured_image_path is None:
logger.error("❌ Image capture failed or timed out. Exiting...")
sys.exit(1)
return frame, captured_image_path
def extract_text(image_path, ocr_processor):
"""Extract text using the selected OCR processor."""
ocr_text = ""
hashtags = []
if ocr_processor == "easyocr":
ocr_text = recognize_text_easyocr(image_path)
elif ocr_processor == "tesseract":
ocr_text = recognize_text_tesseract(image_path)
elif ocr_processor == "pyobjc":
ocr_text, hashtags, barcode_info = recognize_text_pyobjc(image_path)
if barcode_info:
logger.info(f"🔍 Barcode/QR Code info: {barcode_info}")
ocr_text += f"\n\nBarcode/QR Code Info: {barcode_info}"
return ocr_text, hashtags
def generate_image_description(image_path, use_ollama_caption, use_ollama_ocr):
"""Generate image description and OCR text using Ollama if enabled."""
caption = get_image_description_ollama(image_path) if use_ollama_caption else ""
ollama_ocr_text = ollama_ocr(image_path) if use_ollama_ocr else ""
return caption, ollama_ocr_text
def detect_and_upload_markers(image_path, check_markers, upload_marked_image):
"""Detect markers in the image and upload marked image if enabled."""
marker_list = []
if check_markers:
markers, marked_image_path, keypoints_image_path = detect_markers(image_path, templates, min_area=500)
logger.info(f" {markers=}")
if markers:
logger.info(f"🎯 Detected markers: {markers}, #{len(markers)}")
marker_list.extend(markers)
if upload_marked_image:
marked_frame = cv2.imread(marked_image_path)
marked_asset_path, _ = transfer_img_to_logseq(marked_frame)
else:
logger.info("🛑 No markers detected.")
return marker_list, marked_image_path, keypoints_image_path
def upload_to_logseq_if_needed(upload,
ocr_text,
caption,
ollama_ocr_text,
frame,
marker_list,
marked_image_path,
hashtags=[]):
"""Upload to Logseq if required."""
if upload:
logger.info('🚀 Uploading to Logseq...')
logseq = Logseq(host="127.0.0.1", port=12315, token="vetinari")
asset_path_list = [transfer_img_to_logseq(frame)]
if marker_list:
marked_frame = cv2.imread(marked_image_path)
marked_asset_path = transfer_img_to_logseq(marked_frame)
asset_path_list.append(marked_asset_path)
pagetitle = f"Upload-{datetime.now().strftime('%Y-%m-%d_%H:%M:%S')}"
upload_to_logseq(logseq,
title=pagetitle,
ocr_text=ocr_text,
caption=caption,
image_path_list=asset_path_list,
marker_list=marker_list,
ollama_ocr_text=ollama_ocr_text,
hashtags=hashtags)
logger.info(f"✅ Upload complete.")
return pagetitle
return None
def main():
"""Main function to execute OCR workflow."""
try:
args = parse_arguments()
log_configuration(args)
start_time = time.time()
if args.image:
image_path = args.image
logger.info(f'📂 Using provided image file.: {image_path}')
if not os.path.exists(image_path):
logger.error(f"Error: Provided image file '{image_path}' does not exist.")
return
frame = cv2.imread(image_path)
if frame is None:
print(f"Error: Failed to load image from '{image_path}'")
return
else:
frame, image_path = capture_image(args.camera)
logger.info('🔍 Extracting text...')
ocr_text, hashtags = extract_text(image_path, args.ocr_processor)
# Check for meaningless text if OCR processor is easyocr or tesseract
if args.ocr_processor in ["easyocr", "tesseract"] and not is_meaningful_text(ocr_text):
logger.warning('❌ OCR output was not meaningful. Discarding...')
ocr_text = ""
logger.info(f"✅ Text extraction complete in {time.time() - start_time:.2f} seconds.\n")
logger.info('🖼️ Get image description...')
caption, ollama_ocr_text = generate_image_description(image_path, args.use_ollama_caption, args.use_ollama_ocr)
logger.info(f"✅ Image description generation complete in {time.time() - start_time:.2f} seconds.\n")
logger.info('🎯 Detecting markers...')
marker_list, marked_image_path, keypoints_image_path = detect_and_upload_markers(image_path,
args.check_markers,
args.upload_marked_image)
pagetitle = upload_to_logseq_if_needed(args.upload,
ocr_text,
caption,
ollama_ocr_text,
frame,
marker_list,
marked_image_path=marked_image_path,
hashtags=hashtags)
logseqpageurl = None
if pagetitle:
logger.info(f"✅ Upload complete. Page title: {pagetitle}")
# get the page URL from the title
baseurl = 'https://archive.petau.net/#/page/'
# make sure the pagetitle is URL-safe, a full URL looks like this
pagetitle_url = urllib.parse.quote(pagetitle, safe="/?#[]@&=")
logseqpageurl = f"{baseurl}{pagetitle_url}"
logger.info(f"Page URL: {logseqpageurl}")
pinry_tags = marker_list + hashtags
img_desc = caption if caption else None
# Call the wrapper function to upload and create the pin
pin_url = upload_and_create_pin(image_path, board_id=2, tags=pinry_tags, source_url=logseqpageurl,
description=img_desc)
if pin_url:
logger.info(f"✅ Pin URL: {pin_url}")
if pagetitle:
if args.upload:
logger.info('Adding Pin URL to Logseq...')
logseq = Logseq(host="127.0.0.1", port=12315, token="vetinari")
append_block_to_logseq_page(pagetitle, "## Pin URL:", f'[The Pinry URL for this upload]({pin_url})',
logseq)
else:
logger.error("❌ Pin creation failed.")
logger.info(f"🚀 Pipeline completed in {time.time() - start_time:.2f} seconds.")
logger.info("------------------------------------------------------\n")
success_message = "Scan Completed Successfully!"
url_to_open = "https://pinry.petau.net" # URL to open after timeout
# Show the image with the success message and open the URL after 5 seconds
show_image_with_message(keypoints_image_path, success_message, timeout=5, url=url_to_open)
except KeyboardInterrupt:
logger.info("\nGracefully shutting down... Interrupt received (Ctrl+C).")
sys.exit(0) # Exit gracefully with status code 0
if __name__ == "__main__":
main()