-
Notifications
You must be signed in to change notification settings - Fork 259
ppm_utils.py: Add func 'verify_screen_with_gemini' #4294
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -36,11 +36,180 @@ | |
| except ImportError: | ||
| import md5 | ||
|
|
||
# Python < 3.10 lacks importlib.metadata.packages_distributions(), which some
# google client libraries expect to find in the stdlib.  Backfill it from the
# importlib_metadata backport when both are available.
try:
    import sys
    if sys.version_info < (3, 10):
        import importlib.metadata
        import importlib_metadata
        if not hasattr(importlib.metadata, "packages_distributions"):
            importlib.metadata.packages_distributions = (
                importlib_metadata.packages_distributions
            )
except ImportError:
    # Backport not installed; the affected libraries will complain on their own.
    pass


# Gemini support is optional: when google-generativeai is absent the visual
# verification helpers simply refuse to run (see verify_screen_with_gemini).
try:
    import google.generativeai as genai
except ImportError:
    genai = None
    logging.getLogger("avocado.app").warning(
        "google-generativeai library not found. Visual verification with Gemini is disabled."
    )


# Some directory/filename utils, for consistency


LOG = logging.getLogger("avocado." + __name__)
|
|
||
|
|
||
def verify_screen_with_gemini(
    image_path,
    prompt,
    api_key=None,
    model_name="gemini-pro-vision",  # Use gemini-pro-vision for images
    save_failed_image=True,
    results_dir=None,
    resize_max_dim=1024,
):
    """
    Verify screen content using Google Gemini API.

    :param image_path: Path to the image file (PPM format expected from QEMU).
    :param prompt: Question to ask about the image.
    :param api_key: Gemini API Key. If None, uses GEMINI_API_KEY env var.
    :param model_name: Model version to use (tried together with a list of
                       known-good fallback models).
    :param save_failed_image: Whether to save the image if validation "fails"
                              (heuristic based on the response, see below).
    :param results_dir: Directory to save failed images.
    :param resize_max_dim: Max dimension to resize image to (maintains aspect
                           ratio). Set to None to disable resizing.
    :return: The text response from Gemini (stripped).
    :raise ImportError: If google-generativeai or Pillow is not installed.
    :raise ValueError: If no API key was given or found in the environment.
    """
    if not genai:
        raise ImportError(
            "google-generativeai library is required for this feature. "
            "Please install it using 'pip install google-generativeai'."
        )

    if not api_key:
        api_key = os.environ.get("GEMINI_API_KEY")

    if not api_key:
        raise ValueError("Gemini API Key is required (set GEMINI_API_KEY env var).")

    # Configure Proxy if set in environment
    # requests/urllib3 usually pick up HTTPS_PROXY automatically, but we ensure it's available.
    if "HTTPS_PROXY" not in os.environ and "https_proxy" not in os.environ:
        LOG.warning("No HTTPS_PROXY set. Gemini API access might fail if you are behind a firewall.")

    # Force REST transport to avoid gRPC proxy issues and ensure better compatibility
    genai.configure(api_key=api_key, transport="rest")

    if not Image:
        raise ImportError("Pillow (PIL) is required to process images.")

    try:
        # Open and process image
        with Image.open(image_path) as img:
            # Resize if requested to save bandwidth/quota
            if resize_max_dim:
                img.thumbnail((resize_max_dim, resize_max_dim))

            # Convert to RGB (PPM is RGB, but good safety measure) and save to
            # JPEG in memory; JPEG is much smaller than PPM or PNG.
            import io

            img_byte_arr = io.BytesIO()
            img.convert("RGB").save(img_byte_arr, format="JPEG", quality=85)
            # Rewind explicitly so the JPEG can be re-read from the start
            # (do not rely on Pillow's implicit seek behavior).
            img_byte_arr.seek(0)
            img_jpeg = Image.open(img_byte_arr)

            # Candidate models to try, in order of preference.  Known-working
            # models come first because the historic default
            # ("gemini-pro-vision") has been deprecated server-side.
            candidate_models = [
                "gemini-flash-latest",
                "gemini-2.0-flash",
                "gemini-2.5-flash",
                "gemini-pro-latest",
                model_name,  # The one passed in argument
                "gemini-1.5-flash",
                "gemini-1.5-pro",
                "gemini-pro-vision",
            ]
            # Remove duplicates while preserving order
            candidate_models = list(dict.fromkeys(candidate_models))

            response = None
            last_error = None

            for model_candidate in candidate_models:
                try:
                    LOG.info("Trying Gemini model: %s", model_candidate)
                    model = genai.GenerativeModel(model_candidate)

                    # Retry transient failures a couple of times per model
                    max_retries = 2
                    for attempt in range(max_retries):
                        try:
                            response = model.generate_content(
                                [prompt, img_jpeg],
                                generation_config=genai.types.GenerationConfig(
                                    temperature=0.1
                                ),
                            )
                            break  # Success, leave the retry loop
                        except Exception as e:
                            if "404" in str(e) or "not found" in str(e).lower():
                                # Model not available: skip retries, move on
                                # to the next candidate model.
                                raise
                            if attempt == max_retries - 1:
                                raise
                            LOG.warning(
                                "Gemini API call failed (attempt %d/%d) for model %s: %s. Retrying...",
                                attempt + 1, max_retries, model_candidate, e,
                            )
                            time.sleep(2)

                    if response:
                        break  # Success, leave the model-candidate loop

                except Exception as e:
                    last_error = e
                    LOG.warning("Model %s failed: %s", model_candidate, e)
                    continue

            if not response:
                # Help debugging by dumping which models this key can access.
                LOG.error("All candidate models failed. Listing available models...")
                try:
                    for m in genai.list_models():
                        LOG.info("Available model: %s (methods: %s)",
                                 m.name, m.supported_generation_methods)
                except Exception as list_e:
                    LOG.error("Failed to list models: %s", list_e)

                raise last_error or Exception("No working Gemini model found")

            result_text = response.text.strip()

            # Debug aid: when the prompt expects a YES/NO answer and the reply
            # is a literal "No", keep a copy of the image that was actually
            # sent.  This is only a heuristic and prompt-dependent -- the
            # caller decides the real pass/fail.
            if save_failed_image and results_dir:
                # Compare the first word only (punctuation stripped), so that
                # answers such as "Note:" or "Normal" are not mistaken for "No".
                first_word = (
                    result_text.split(None, 1)[0].strip(".,;:!?").lower()
                    if result_text else ""
                )
                if first_word == "no":
                    try:
                        timestamp = time.strftime("%Y%m%d_%H%M%S")
                        fail_filename = "gemini_fail_%s.jpg" % timestamp
                        fail_path = os.path.join(results_dir, fail_filename)
                        # exist_ok avoids a check-then-create race with
                        # concurrent tests writing to the same results dir.
                        os.makedirs(results_dir, exist_ok=True)
                        # Save the compressed/resized version we actually sent
                        with open(fail_path, "wb") as f:
                            f.write(img_byte_arr.getvalue())
                        LOG.info("Saved failed visual check image to: %s", fail_path)
                    except Exception as e:
                        LOG.error("Failed to save debug image: %s", e)

            return result_text

    except Exception as e:
        LOG.error("Gemini API call failed: %s", e)
        # We re-raise to let the test fail with ERROR status
        raise
|
|
||
|
|
||
| def _md5eval(data): | ||
| """ | ||
| Returns a md5 hash evaluator. This function is implemented in order to | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🧩 Analysis chain
🏁 Script executed:
Repository: avocado-framework/avocado-vt
Length of output: 200
🌐 Web query:
latest stable version google-generativeai PyPI 2025💡 Result:
The latest stable PyPI release of google-generativeai is version 0.8.6 — uploaded Dec 16, 2025. [1][2]
Note: the package is marked legacy/deprecated with support ended Nov 30, 2025. [1]
Sources:
[1] PyPI project page for google-generativeai.
[2] Safety DB package entry for google-generativeai.
🌐 Web query:
latest stable version Pillow PyPI 2025💡 Result:
Latest stable Pillow on PyPI (as of Dec 30, 2025): 12.0.0 — released/uploaded Oct 15, 2025. [1]
Sources
[1] PyPI project page for Pillow (release files and upload date).
🌐 Web query:
google-generativeai deprecated EOL security November 2025💡 Result:
Short summary — relevant deprecations / EOL dates in Nov 2025:
If you want, I can fetch the full deprecation pages and list every affected model/feature and exact shutdown dates for your account/region.
🌐 Web query:
Pillow CVE security vulnerabilities 2025💡 Result:
Summary — Pillow CVE(s) in 2025
References
🌐 Web query:
google-generativeai CVE security vulnerabilities 2025💡 Result:
Here are notable CVEs and security bulletins (2025) related to Google’s generative-AI products (Vertex AI / Gemini) and adjacent Google AI incidents:
CVE-2025-11915 — Connection desynchronization (HTTP request/response smuggling) in Vertex AI model proxying; fixed/rolled out by Google (GCP-2025-059). Published/linked in Google Cloud security bulletins (published 2025-10-21; CVE entry in NVD). [1][2]
GCP-2025-059 (Vertex AI security bulletin) — streaming requests misrouting between recipients for certain third‑party and self‑deployed models; fixes rolled out Sept 26–28, 2025. (Google’s bulletin page lists the incident and CVE mapping). [1]
CVE-2025-6965 / related Google incident — media coverage reporting a patched memory‑corruption vuln (patched June 2025) where Google said an AI agent helped thwart exploitation; reported coverage (SecurityWeek). (Note: follow-up technical details limited in public reporting). [3]
Related research / disclosure discussions — public researcher posts and press coverage in 2025 raised issues with model input handling (prompt/ASCII smuggling and prompt‑injection risks affecting Gemini and integrated agents); Google’s position varied by report. See coverage/summaries. [4]
If you want, I can:
Sources: Google Cloud Vertex AI security bulletins (GCP-2025-059) [1]; NVD CVE entry for CVE‑2025‑11915 (NIST) [2]; SecurityWeek reporting on CVE‑2025‑6965 and Google AI agent use [3]; reporting on Gemini/input‑handling research [4].
Pin dependency versions and address end-of-life package.
The new dependencies lack version specifications. Additionally,
`google-generativeai` is marked as deprecated/legacy with support ended November 30, 2025, and should not be used for new development. For `Pillow`, the suggested version range is outdated; version 12.0.0 is now stable, and versions 11.3.0+ include a critical security fix for CVE-2025-48379 (heap-based buffer overflow in DDS/BCn image handling). Either replace
`google-generativeai` with an alternative maintained package, or, if it must be retained, document the EOL status and migration plan. For `Pillow`, pin to the latest stable release. (Remove
`google-generativeai` or provide justification for using an EOL package.) 📝 Committable suggestion
🤖 Prompt for AI Agents