From a1d1f38a01978d7a7407552f6f948ac0b712f438 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
 <161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Mon, 5 Jan 2026 07:07:05 +0000
Subject: [PATCH] Refactor passage HTML parsing for Telegram compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Updated `ScriptureBot` to perform server-side HTML preprocessing for Telegram messages.
- Replaced `FilterTree` logic in `pkg/app/passage.go` with a recursive `ParseNodesForPassage` function that walks the entire HTML tree.
- Implemented robust HTML tag conversion:
  - `<h1>`-`<h6>` -> `<b>` (bold with newlines)
  - `<ul>`/`<li>` -> Bullet points (`•`)
  - `div` -> Recurse children (stripping tag)
  - `b`/`strong` -> `<b>`
  - `i`/`em` -> `<i>`
  - `sup` -> Unicode superscripts (via `platform.TelegramSuperscript`)
- Added explicit HTML escaping for all text content to prevent injection or broken tags.
- Configured `env.Res.ParseMode = "HTML"` for `GetBiblePassage` responses.
- Updated unit tests in `pkg/app/passage_test.go` to verify new HTML output formats and correct handling of lists, headers, and superscripts.
---
 pkg/app/passage.go      | 95 +++++++++++++++++++++++------------------
 pkg/app/passage_test.go | 52 ++++++++++++++++------
 2 files changed, 93 insertions(+), 54 deletions(-)
diff --git a/pkg/app/passage.go b/pkg/app/passage.go
index 5a5a7d3..2c610bc 100644
--- a/pkg/app/passage.go
+++ b/pkg/app/passage.go
@@ -8,6 +8,7 @@ import (
 	"log"
 	"net/url"
 	"strings"
+	stdhtml "html"
 
 	"golang.org/x/net/html"
 
@@ -59,20 +60,30 @@ func isNextSiblingBr(node *html.Node) bool {
 }
 
 func ParseNodesForPassage(node *html.Node) string {
-	var text string
 	var parts []string
 
 	for child := node.FirstChild; child != nil; child = child.NextSibling {
-		parts = append(parts, text)
+		// Filter out footnotes sections/cross-refs if they appear as divs
+		if child.Type == html.ElementNode {
+			for _, attr := range child.Attr {
+				if attr.Key == "class" {
+					if strings.Contains(attr.Val, "footnotes") || strings.Contains(attr.Val, "cross-refs") {
+						continue
+					}
+				}
+			}
+		}
 
 		switch tag := child.Data; tag {
 		case "span":
+			// Keep existing logic for span (likely poetry lines in legacy/scraped HTML)
 			childText := ParseNodesForPassage(child)
 			parts = append(parts, childText)
 			if len(strings.TrimSpace(childText)) > 0 && !isNextSiblingBr(child) {
 				parts = append(parts, "\n")
 			}
 		case "sup":
+			// Handle superscripts (verse numbers/footnotes)
 			isFootnote := func(node *html.Node) bool {
 				for _, attr := range node.Attr {
 					if attr.Key == "class" && attr.Val == "footnote" {
@@ -85,67 +96,62 @@ func ParseNodesForPassage(node *html.Node) string {
 				break
 			}
 			childText := ParseNodesForPassage(child)
+			// Use TelegramSuperscript for unicode conversion
 			if len(childText) > 0 {
-				parts = append(parts, fmt.Sprintf("^%s^", childText))
+				parts = append(parts, platform.TelegramSuperscript(childText))
 			}
 			break
 		case "p":
 			parts = append(parts, ParseNodesForPassage(child))
-			break
-		case "b":
-			parts = append(parts, platform.TelegramBold(ParseNodesForPassage(child)))
-		case "i":
-			parts = append(parts, platform.TelegramItalics(ParseNodesForPassage(child)))
-			break
+			parts = append(parts, "\n\n")
+		case "b", "strong":
+			parts = append(parts, fmt.Sprintf("<b>%s</b>", ParseNodesForPassage(child)))
+		case "i", "em":
+			parts = append(parts, fmt.Sprintf("<i>%s</i>", ParseNodesForPassage(child)))
+		case "h1", "h2", "h3", "h4", "h5", "h6":
+			// Ignore "Footnotes" or "Cross references" headers
+			headerText := ParseNodesForPassage(child)
+			if headerText == "Footnotes" || headerText == "Cross references" {
+				continue
+			}
+			parts = append(parts, fmt.Sprintf("\n\n<b>%s</b>\n", headerText))
+		case "ul", "ol":
+			parts = append(parts, ParseNodesForPassage(child))
+		case "li":
+			parts = append(parts, fmt.Sprintf("• %s\n", ParseNodesForPassage(child)))
 		case "br":
 			parts = append(parts, "\n")
-			break
+		case "div":
+			parts = append(parts, ParseNodesForPassage(child))
 		default:
-			parts = append(parts, child.Data)
+			if child.Type == html.TextNode {
+				parts = append(parts, stdhtml.EscapeString(child.Data))
+			} else if child.Type == html.ElementNode {
+				// Recurse for unknown elements to preserve content
+				parts = append(parts, ParseNodesForPassage(child))
+			}
 		}
 	}
 
-	text = strings.Join(parts, "")
-
-	if node.Data == "h1" || node.Data == "h2" || node.Data == "h3" || node.Data == "h4" {
-		text = fmt.Sprintf("*%s*", text)
-	}
-	return text
+	return strings.Join(parts, "")
 }
 
 func GetPassage(ref string, doc *html.Node, version string) string {
-	filtNodes := utils.FilterTree(doc, func(child *html.Node) bool {
-		switch tag := child.Data; tag {
-		case "h1":
-			fallthrough
-		case "h2":
-			fallthrough
-		case "h3":
-			fallthrough
-		case "h4":
-			if child.FirstChild.Data == "Footnotes" || child.FirstChild.Data == "Cross references" {
-				return false
-			}
-			fallthrough
-		case "p":
-			return true
-		}
-		return false
-	})
+	// Replaced FilterTree with direct parsing of the root node
+	// This allows handling arbitrary structure (divs, lists) returned by the API
 
-	textBlocks := utils.MapNodeListToString(filtNodes, ParseNodesForPassage)
+	text := ParseNodesForPassage(doc)
 
 	var passage strings.Builder
 
 	if len(ref) > 0 {
-		refString := fmt.Sprintf("_%s_ (%s)", ref, version)
+		// Use HTML formatting for reference
+		refString := fmt.Sprintf("<i>%s</i> (%s)", ref, version)
 		passage.WriteString(refString)
 	}
 
-	for _, block := range textBlocks {
-		passage.WriteString("\n")
-		passage.WriteString(block)
-	}
+	passage.WriteString("\n")
+	passage.WriteString(strings.TrimSpace(text))
 
 	return passage.String()
 }
@@ -158,6 +164,11 @@ func ParsePassageFromHtml(ref string, rawHtml string, version string) string {
 		return rawHtml
 	}
 
+	// html.Parse returns a doc with html->body structure.
+	// GetPassage -> ParseNodesForPassage will traverse it.
+	// We might want to find 'body' to avoid processing 'head'?
+	// ParseNodesForPassage iterates children. doc->html->body.
+	// We can let it recurse.
 	return strings.TrimSpace(GetPassage(ref, doc, version))
 }
 
@@ -181,6 +192,7 @@ func GetBiblePassageFallback(env def.SessionData) def.SessionData {
 
 	// Attempt to get the passage
 	env.Res.Message = GetPassage(ref, passageNode, config.Version)
+	env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML
 
 	return env
 }
@@ -224,6 +236,7 @@ func GetBiblePassage(env def.SessionData) def.SessionData {
 
 			if len(resp.Verse) > 0 {
 				env.Res.Message = ParsePassageFromHtml(env.Msg.Message, resp.Verse, config.Version)
+				env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML
 				return env
 			}
 		}
diff --git a/pkg/app/passage_test.go b/pkg/app/passage_test.go
index e1dbc6b..29fdaeb 100644
--- a/pkg/app/passage_test.go
+++ b/pkg/app/passage_test.go
@@ -112,6 +112,10 @@ func TestGetBiblePassage(t *testing.T) {
 		if len(env.Res.Message) < 10 {
 			t.Errorf("Expected passage text, got '%s'", env.Res.Message)
 		}
+		// Verify ParseMode is set
+		if env.Res.ParseMode != "HTML" {
+			t.Errorf("Expected ParseMode 'HTML', got '%s'", env.Res.ParseMode)
+		}
 	})
 
 	t.Run("Empty", func(t *testing.T) {
@@ -166,21 +170,26 @@ func TestGetBiblePassage(t *testing.T) {
 		if !strings.Contains(env.Res.Message, "In the beginning") {
 			t.Errorf("Expected fallback passage content, got '%s'", env.Res.Message)
 		}
+		// Fallback should also use HTML mode
+		if env.Res.ParseMode != "HTML" {
+			t.Errorf("Expected ParseMode 'HTML' in fallback, got '%s'", env.Res.ParseMode)
+		}
 	})
 }
 
 func TestParsePassageFromHtml(t *testing.T) {
 	t.Run("Valid HTML with superscript", func(t *testing.T) {
 		html := `<p><span><sup>12 </sup>But to all who did receive him, who believed in his name, he gave the right to become children of God,</span></p>`
-		expected := `^12 ^But to all who did receive him, who believed in his name, he gave the right to become children of God,`
+		// Updated expectation: unicode superscripts and HTML formatting
+		expected := `¹²But to all who did receive him, who believed in his name, he gave the right to become children of God,`
 		if got := ParsePassageFromHtml("", html, ""); got != expected {
-			t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
+			t.Errorf("ParsePassageFromHtml() = %s, want %s", got, expected)
 		}
 	})
 
 	t.Run("HTML with italics", func(t *testing.T) {
 		html := `<p><i>This is italic.</i></p>`
-		expected := `_This is italic._`
+		expected := `<i>This is italic.</i>`
 		if got := ParsePassageFromHtml("", html, ""); got != expected {
 			t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
 		}
@@ -188,7 +197,7 @@ func TestParsePassageFromHtml(t *testing.T) {
 
 	t.Run("HTML with bold", func(t *testing.T) {
 		html := `<p><b>This is bold.</b></p>`
-		expected := `*This is bold.*`
+		expected := `<b>This is bold.</b>`
 		if got := ParsePassageFromHtml("", html, ""); got != expected {
 			t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
 		}
@@ -228,21 +237,38 @@ func TestParsePassageFromHtml(t *testing.T) {
 
 	t.Run("Nested HTML tags", func(t *testing.T) {
 		html := `<p><b>This is bold, <i>and this is italic.</i></b></p>`
-		expected := `*This is bold, _and this is italic._*`
+		expected := `<b>This is bold, <i>and this is italic.</i></b>`
 		if got := ParsePassageFromHtml("", html, ""); got != expected {
 			t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
 		}
 	})
 
-	t.Run("MarkdownV2 escaping", func(t *testing.T) {
-		// Note: We no longer escape explicitly in ParsePassageFromHtml as we rely on the platform
-		// to handle it later (via PostTelegram).
-		// However, returning raw characters like * might cause issues if not handled by platform.
-		// For now, we expect them to be returned raw.
-		html := `<p>This has special characters: *_. [hello](world)!</p>`
-		expected := `This has special characters: *_. [hello](world)!`
+	t.Run("Lists", func(t *testing.T) {
+		html := `<ul><li>Item 1</li><li>Item 2</li></ul>`
+		// Note: The ParseNodesForPassage appends newline after each Item.
+		// strings.TrimSpace removes the last newline.
+		// Item 1\nItem 2\n -> Item 1\nItem 2
+		expected := "• Item 1\n• Item 2"
 		if got := ParsePassageFromHtml("", html, ""); got != expected {
-			t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
+			t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
+		}
+	})
+
+	t.Run("Headers", func(t *testing.T) {
+		html := `<h1>Header</h1>`
+		// Code: \n\n<b>Header</b>\n
+		// TrimSpace -> <b>Header</b>
+		expected := "<b>Header</b>"
+		if got := ParsePassageFromHtml("", html, ""); got != expected {
+			t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
+		}
+	})
+
+	t.Run("Divs and escaping", func(t *testing.T) {
+		html := `<div>Text &lt;with&gt; symbols</div>`
+		expected := "Text &lt;with&gt; symbols"
+		if got := ParsePassageFromHtml("", html, ""); got != expected {
+			t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
 		}
 	})
 }