import sys
import os
import logging

from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

logging.basicConfig(level=logging.INFO, format='%(message)s')


def get_caption(img_path):
    model_path = "Salesforce/blip-image-captioning-base"
    # General cache directory; rely on the transformers library for caching.
    cache_dir = os.path.join(os.path.expanduser("~"), ".cache/huggingface")

    logging.info("🚨 Loading BLIP model.")
    processor = BlipProcessor.from_pretrained(model_path, cache_dir=cache_dir)
    model = BlipForConditionalGeneration.from_pretrained(model_path, cache_dir=cache_dir)
    logging.info("🤗 Model loaded successfully.")

    # Convert to RGB so grayscale/RGBA inputs match the processor's expectations.
    img = Image.open(img_path).convert("RGB")

    # Unconditional image captioning: the model receives only the image.
    inputs = processor(img, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=100)
    caption = processor.decode(out[0], skip_special_tokens=True)
    print(caption)
    return caption


if __name__ == "__main__":
    if len(sys.argv) > 1:
        img_path = sys.argv[1]  # Image path from the command-line argument
        get_caption(img_path)
    else:
        print("No image path provided.")
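
# --- Conditional captioning (sketch) ---
# The BLIP model card also documents conditional captioning, where a text
# prompt seeds the generated caption. The helper below is a minimal sketch of
# that usage and is not called by the script above; the function name and the
# default prompt string ("a photography of", taken from the model card
# example) are illustrative assumptions, not part of the original script.
def get_conditional_caption(img_path, prompt="a photography of"):
    model_path = "Salesforce/blip-image-captioning-base"
    processor = BlipProcessor.from_pretrained(model_path)
    model = BlipForConditionalGeneration.from_pretrained(model_path)
    img = Image.open(img_path).convert("RGB")
    # Passing both the image and a text prompt switches BLIP to conditional
    # captioning: the generated caption continues the given prompt.
    inputs = processor(img, prompt, return_tensors="pt")
    out = model.generate(**inputs, max_new_tokens=100)
    return processor.decode(out[0], skip_special_tokens=True)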