From a1d1f38a01978d7a7407552f6f948ac0b712f438 Mon Sep 17 00:00:00 2001
From: "google-labs-jules[bot]"
<161369871+google-labs-jules[bot]@users.noreply.github.com>
Date: Mon, 5 Jan 2026 07:07:05 +0000
Subject: [PATCH] Refactor passage HTML parsing for Telegram compatibility
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Updated `ScriptureBot` to perform server-side HTML preprocessing for Telegram messages.
- Replaced `FilterTree` logic in `pkg/app/passage.go` with a recursive `ParseNodesForPassage` function that walks the entire HTML tree.
- Implemented robust HTML tag conversion:
- `<h1>`-`<h6>` -> `<b>` (bold with newlines)
- `<ul>`/`<li>` -> Bullet points (`•`)
- `div` -> Recurse children (stripping tag)
- `b`/`strong` -> `<b>`
- `i`/`em` -> `<i>`
- `sup` -> Unicode superscripts (via `platform.TelegramSuperscript`)
- Added explicit HTML escaping for all text content to prevent injection or broken tags.
- Configured `env.Res.ParseMode = "HTML"` for `GetBiblePassage` responses.
- Updated unit tests in `pkg/app/passage_test.go` to verify new HTML output formats and correct handling of lists, headers, and superscripts.
---
pkg/app/passage.go | 95 +++++++++++++++++++++++------------------
pkg/app/passage_test.go | 52 ++++++++++++++++------
2 files changed, 93 insertions(+), 54 deletions(-)
diff --git a/pkg/app/passage.go b/pkg/app/passage.go
index 5a5a7d3..2c610bc 100644
--- a/pkg/app/passage.go
+++ b/pkg/app/passage.go
@@ -8,6 +8,7 @@ import (
"log"
"net/url"
"strings"
+ stdhtml "html"
"golang.org/x/net/html"
@@ -59,20 +60,30 @@ func isNextSiblingBr(node *html.Node) bool {
}
func ParseNodesForPassage(node *html.Node) string {
- var text string
var parts []string
for child := node.FirstChild; child != nil; child = child.NextSibling {
- parts = append(parts, text)
+ // Filter out footnotes sections/cross-refs if they appear as divs
+ if child.Type == html.ElementNode {
+ for _, attr := range child.Attr {
+ if attr.Key == "class" {
+ if strings.Contains(attr.Val, "footnotes") || strings.Contains(attr.Val, "cross-refs") {
+ continue
+ }
+ }
+ }
+ }
switch tag := child.Data; tag {
case "span":
+ // Keep existing logic for span (likely poetry lines in legacy/scraped HTML)
childText := ParseNodesForPassage(child)
parts = append(parts, childText)
if len(strings.TrimSpace(childText)) > 0 && !isNextSiblingBr(child) {
parts = append(parts, "\n")
}
case "sup":
+ // Handle superscripts (verse numbers/footnotes)
isFootnote := func(node *html.Node) bool {
for _, attr := range node.Attr {
if attr.Key == "class" && attr.Val == "footnote" {
@@ -85,67 +96,62 @@ func ParseNodesForPassage(node *html.Node) string {
break
}
childText := ParseNodesForPassage(child)
+ // Use TelegramSuperscript for unicode conversion
if len(childText) > 0 {
- parts = append(parts, fmt.Sprintf("^%s^", childText))
+ parts = append(parts, platform.TelegramSuperscript(childText))
}
break
case "p":
parts = append(parts, ParseNodesForPassage(child))
- break
- case "b":
- parts = append(parts, platform.TelegramBold(ParseNodesForPassage(child)))
- case "i":
- parts = append(parts, platform.TelegramItalics(ParseNodesForPassage(child)))
- break
+ parts = append(parts, "\n\n")
+ case "b", "strong":
+ parts = append(parts, fmt.Sprintf("%s", ParseNodesForPassage(child)))
+ case "i", "em":
+ parts = append(parts, fmt.Sprintf("%s", ParseNodesForPassage(child)))
+ case "h1", "h2", "h3", "h4", "h5", "h6":
+ // Ignore "Footnotes" or "Cross references" headers
+ headerText := ParseNodesForPassage(child)
+ if headerText == "Footnotes" || headerText == "Cross references" {
+ continue
+ }
+ parts = append(parts, fmt.Sprintf("\n\n%s\n", headerText))
+ case "ul", "ol":
+ parts = append(parts, ParseNodesForPassage(child))
+ case "li":
+ parts = append(parts, fmt.Sprintf("• %s\n", ParseNodesForPassage(child)))
case "br":
parts = append(parts, "\n")
- break
+ case "div":
+ parts = append(parts, ParseNodesForPassage(child))
default:
- parts = append(parts, child.Data)
+ if child.Type == html.TextNode {
+ parts = append(parts, stdhtml.EscapeString(child.Data))
+ } else if child.Type == html.ElementNode {
+ // Recurse for unknown elements to preserve content
+ parts = append(parts, ParseNodesForPassage(child))
+ }
}
}
- text = strings.Join(parts, "")
-
- if node.Data == "h1" || node.Data == "h2" || node.Data == "h3" || node.Data == "h4" {
- text = fmt.Sprintf("*%s*", text)
- }
- return text
+ return strings.Join(parts, "")
}
func GetPassage(ref string, doc *html.Node, version string) string {
- filtNodes := utils.FilterTree(doc, func(child *html.Node) bool {
- switch tag := child.Data; tag {
- case "h1":
- fallthrough
- case "h2":
- fallthrough
- case "h3":
- fallthrough
- case "h4":
- if child.FirstChild.Data == "Footnotes" || child.FirstChild.Data == "Cross references" {
- return false
- }
- fallthrough
- case "p":
- return true
- }
- return false
- })
+ // Replaced FilterTree with direct parsing of the root node
+ // This allows handling arbitrary structure (divs, lists) returned by the API
- textBlocks := utils.MapNodeListToString(filtNodes, ParseNodesForPassage)
+ text := ParseNodesForPassage(doc)
var passage strings.Builder
if len(ref) > 0 {
- refString := fmt.Sprintf("_%s_ (%s)", ref, version)
+ // Use HTML formatting for reference
+ refString := fmt.Sprintf("%s (%s)", ref, version)
passage.WriteString(refString)
}
- for _, block := range textBlocks {
- passage.WriteString("\n")
- passage.WriteString(block)
- }
+ passage.WriteString("\n")
+ passage.WriteString(strings.TrimSpace(text))
return passage.String()
}
@@ -158,6 +164,11 @@ func ParsePassageFromHtml(ref string, rawHtml string, version string) string {
return rawHtml
}
+ // html.Parse returns a doc with html->body structure.
+ // GetPassage -> ParseNodesForPassage will traverse it.
+ // We might want to find 'body' to avoid processing 'head'?
+ // ParseNodesForPassage iterates children. doc->html->body.
+ // We can let it recurse.
return strings.TrimSpace(GetPassage(ref, doc, version))
}
@@ -181,6 +192,7 @@ func GetBiblePassageFallback(env def.SessionData) def.SessionData {
// Attempt to get the passage
env.Res.Message = GetPassage(ref, passageNode, config.Version)
+ env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML
return env
}
@@ -224,6 +236,7 @@ func GetBiblePassage(env def.SessionData) def.SessionData {
if len(resp.Verse) > 0 {
env.Res.Message = ParsePassageFromHtml(env.Msg.Message, resp.Verse, config.Version)
+ env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML
return env
}
}
diff --git a/pkg/app/passage_test.go b/pkg/app/passage_test.go
index e1dbc6b..29fdaeb 100644
--- a/pkg/app/passage_test.go
+++ b/pkg/app/passage_test.go
@@ -112,6 +112,10 @@ func TestGetBiblePassage(t *testing.T) {
if len(env.Res.Message) < 10 {
t.Errorf("Expected passage text, got '%s'", env.Res.Message)
}
+ // Verify ParseMode is set
+ if env.Res.ParseMode != "HTML" {
+ t.Errorf("Expected ParseMode 'HTML', got '%s'", env.Res.ParseMode)
+ }
})
t.Run("Empty", func(t *testing.T) {
@@ -166,21 +170,26 @@ func TestGetBiblePassage(t *testing.T) {
if !strings.Contains(env.Res.Message, "In the beginning") {
t.Errorf("Expected fallback passage content, got '%s'", env.Res.Message)
}
+ // Fallback should also use HTML mode
+ if env.Res.ParseMode != "HTML" {
+ t.Errorf("Expected ParseMode 'HTML' in fallback, got '%s'", env.Res.ParseMode)
+ }
})
}
func TestParsePassageFromHtml(t *testing.T) {
t.Run("Valid HTML with superscript", func(t *testing.T) {
html := `
12 But to all who did receive him, who believed in his name, he gave the right to become children of God,
`
- expected := `^12 ^But to all who did receive him, who believed in his name, he gave the right to become children of God,`
+ // Updated expectation: unicode superscripts and HTML formatting
+ expected := `¹²But to all who did receive him, who believed in his name, he gave the right to become children of God,`
if got := ParsePassageFromHtml("", html, ""); got != expected {
- t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
+ t.Errorf("ParsePassageFromHtml() = %s, want %s", got, expected)
}
})
t.Run("HTML with italics", func(t *testing.T) {
html := `This is italic.
`
- expected := `_This is italic._`
+ expected := `This is italic.`
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
}
@@ -188,7 +197,7 @@ func TestParsePassageFromHtml(t *testing.T) {
t.Run("HTML with bold", func(t *testing.T) {
html := `This is bold.
`
- expected := `*This is bold.*`
+ expected := `This is bold.`
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
}
@@ -228,21 +237,38 @@ func TestParsePassageFromHtml(t *testing.T) {
t.Run("Nested HTML tags", func(t *testing.T) {
html := `This is bold, and this is italic.
`
- expected := `*This is bold, _and this is italic._*`
+ expected := `This is bold, and this is italic.`
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
}
})
- t.Run("MarkdownV2 escaping", func(t *testing.T) {
- // Note: We no longer escape explicitly in ParsePassageFromHtml as we rely on the platform
- // to handle it later (via PostTelegram).
- // However, returning raw characters like * might cause issues if not handled by platform.
- // For now, we expect them to be returned raw.
- html := `This has special characters: *_. [hello](world)!
`
- expected := `This has special characters: *_. [hello](world)!`
+ t.Run("Lists", func(t *testing.T) {
+ html := ``
+ // Note: The ParseNodesForPassage appends newline after each Item.
+ // strings.TrimSpace removes the last newline.
+ // Item 1\nItem 2\n -> Item 1\nItem 2
+ expected := "• Item 1\n• Item 2"
if got := ParsePassageFromHtml("", html, ""); got != expected {
- t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
+ t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
+ }
+ })
+
+ t.Run("Headers", func(t *testing.T) {
+ html := `Header
`
+ // Code: \n\nHeader\n
+ // TrimSpace -> Header
+ expected := "Header"
+ if got := ParsePassageFromHtml("", html, ""); got != expected {
+ t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
+ }
+ })
+
+ t.Run("Divs and escaping", func(t *testing.T) {
+ html := `Text <with> symbols
`
+ expected := "Text <with> symbols"
+ if got := ParsePassageFromHtml("", html, ""); got != expected {
+ t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
}
})
}