Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 54 additions & 41 deletions pkg/app/passage.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"log"
"net/url"
"strings"
stdhtml "html"

"golang.org/x/net/html"

Expand Down Expand Up @@ -59,20 +60,30 @@ func isNextSiblingBr(node *html.Node) bool {
}

func ParseNodesForPassage(node *html.Node) string {
var text string
var parts []string

for child := node.FirstChild; child != nil; child = child.NextSibling {
parts = append(parts, text)
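// Intended to filter out footnote/cross-reference sections: any element whose
// class attribute contains "footnotes" or "cross-refs" (not only divs).
// NOTE(review): the `continue` below sits inside the attribute loop, so it only
// advances that inner loop; the matching child is still handled by the switch
// that follows. Skipping the child needs a flag or a labeled continue.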
if child.Type == html.ElementNode {
for _, attr := range child.Attr {
if attr.Key == "class" {
if strings.Contains(attr.Val, "footnotes") || strings.Contains(attr.Val, "cross-refs") {
continue
}
}
}
}

switch tag := child.Data; tag {
case "span":
// Keep existing logic for span (likely poetry lines in legacy/scraped HTML)
childText := ParseNodesForPassage(child)
parts = append(parts, childText)
if len(strings.TrimSpace(childText)) > 0 && !isNextSiblingBr(child) {
parts = append(parts, "\n")
}
case "sup":
// Handle superscripts (verse numbers/footnotes)
isFootnote := func(node *html.Node) bool {
for _, attr := range node.Attr {
if attr.Key == "class" && attr.Val == "footnote" {
Expand All @@ -85,67 +96,62 @@ func ParseNodesForPassage(node *html.Node) string {
break
}
childText := ParseNodesForPassage(child)
// Use TelegramSuperscript for unicode conversion
if len(childText) > 0 {
parts = append(parts, fmt.Sprintf("^%s^", childText))
parts = append(parts, platform.TelegramSuperscript(childText))
}
break
case "p":
parts = append(parts, ParseNodesForPassage(child))
break
case "b":
parts = append(parts, platform.TelegramBold(ParseNodesForPassage(child)))
case "i":
parts = append(parts, platform.TelegramItalics(ParseNodesForPassage(child)))
break
parts = append(parts, "\n\n")
case "b", "strong":
parts = append(parts, fmt.Sprintf("<b>%s</b>", ParseNodesForPassage(child)))
case "i", "em":
parts = append(parts, fmt.Sprintf("<i>%s</i>", ParseNodesForPassage(child)))
case "h1", "h2", "h3", "h4", "h5", "h6":
// Ignore "Footnotes" or "Cross references" headers
headerText := ParseNodesForPassage(child)
if headerText == "Footnotes" || headerText == "Cross references" {
continue
}
parts = append(parts, fmt.Sprintf("\n\n<b>%s</b>\n", headerText))
case "ul", "ol":
parts = append(parts, ParseNodesForPassage(child))
case "li":
parts = append(parts, fmt.Sprintf("• %s\n", ParseNodesForPassage(child)))
case "br":
parts = append(parts, "\n")
break
case "div":
parts = append(parts, ParseNodesForPassage(child))
default:
parts = append(parts, child.Data)
if child.Type == html.TextNode {
parts = append(parts, stdhtml.EscapeString(child.Data))
} else if child.Type == html.ElementNode {
// Recurse for unknown elements to preserve content
parts = append(parts, ParseNodesForPassage(child))
}
}
}

text = strings.Join(parts, "")

if node.Data == "h1" || node.Data == "h2" || node.Data == "h3" || node.Data == "h4" {
text = fmt.Sprintf("*%s*", text)
}
return text
return strings.Join(parts, "")
}

func GetPassage(ref string, doc *html.Node, version string) string {
filtNodes := utils.FilterTree(doc, func(child *html.Node) bool {
switch tag := child.Data; tag {
case "h1":
fallthrough
case "h2":
fallthrough
case "h3":
fallthrough
case "h4":
if child.FirstChild.Data == "Footnotes" || child.FirstChild.Data == "Cross references" {
return false
}
fallthrough
case "p":
return true
}
return false
})
// Replaced FilterTree with direct parsing of the root node
// This allows handling arbitrary structure (divs, lists) returned by the API

textBlocks := utils.MapNodeListToString(filtNodes, ParseNodesForPassage)
text := ParseNodesForPassage(doc)

var passage strings.Builder

if len(ref) > 0 {
refString := fmt.Sprintf("_%s_ (%s)", ref, version)
// Use HTML formatting for reference
refString := fmt.Sprintf("<i>%s</i> (%s)", ref, version)
passage.WriteString(refString)
}

for _, block := range textBlocks {
passage.WriteString("\n")
passage.WriteString(block)
}
passage.WriteString("\n")
passage.WriteString(strings.TrimSpace(text))

return passage.String()
}
Expand All @@ -158,6 +164,11 @@ func ParsePassageFromHtml(ref string, rawHtml string, version string) string {
return rawHtml
}

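// html.Parse returns a document node that wraps the html -> body structure.
// GetPassage passes that root to ParseNodesForPassage, which iterates the
// children and recurses into elements it has no explicit case for, so the
// body content is reached without locating <body> first.
// Open question: should we find 'body' explicitly to avoid processing 'head'?
// For now we let the traversal recurse from the root.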
return strings.TrimSpace(GetPassage(ref, doc, version))
}

Expand All @@ -181,6 +192,7 @@ func GetBiblePassageFallback(env def.SessionData) def.SessionData {

// Attempt to get the passage
env.Res.Message = GetPassage(ref, passageNode, config.Version)
env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML

return env
}
Expand Down Expand Up @@ -224,6 +236,7 @@ func GetBiblePassage(env def.SessionData) def.SessionData {

if len(resp.Verse) > 0 {
env.Res.Message = ParsePassageFromHtml(env.Msg.Message, resp.Verse, config.Version)
env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML
return env
}
}
Expand Down
52 changes: 39 additions & 13 deletions pkg/app/passage_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,10 @@ func TestGetBiblePassage(t *testing.T) {
if len(env.Res.Message) < 10 {
t.Errorf("Expected passage text, got '%s'", env.Res.Message)
}
// Verify ParseMode is set
if env.Res.ParseMode != "HTML" {
t.Errorf("Expected ParseMode 'HTML', got '%s'", env.Res.ParseMode)
}
})

t.Run("Empty", func(t *testing.T) {
Expand Down Expand Up @@ -166,29 +170,34 @@ func TestGetBiblePassage(t *testing.T) {
if !strings.Contains(env.Res.Message, "In the beginning") {
t.Errorf("Expected fallback passage content, got '%s'", env.Res.Message)
}
// Fallback should also use HTML mode
if env.Res.ParseMode != "HTML" {
t.Errorf("Expected ParseMode 'HTML' in fallback, got '%s'", env.Res.ParseMode)
}
})
}

func TestParsePassageFromHtml(t *testing.T) {
t.Run("Valid HTML with superscript", func(t *testing.T) {
html := `<p><span><sup>12 </sup>But to all who did receive him, who believed in his name, he gave the right to become children of God,</span></p>`
expected := `^12 ^But to all who did receive him, who believed in his name, he gave the right to become children of God,`
// Updated expectation: unicode superscripts and HTML formatting
expected := `¹²But to all who did receive him, who believed in his name, he gave the right to become children of God,`
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
t.Errorf("ParsePassageFromHtml() = %s, want %s", got, expected)
}
})

t.Run("HTML with italics", func(t *testing.T) {
html := `<p><i>This is italic.</i></p>`
expected := `_This is italic._`
expected := `<i>This is italic.</i>`
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
}
})

t.Run("HTML with bold", func(t *testing.T) {
html := `<p><b>This is bold.</b></p>`
expected := `*This is bold.*`
expected := `<b>This is bold.</b>`
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
}
Expand Down Expand Up @@ -228,21 +237,38 @@ func TestParsePassageFromHtml(t *testing.T) {

t.Run("Nested HTML tags", func(t *testing.T) {
html := `<p><b>This is bold, <i>and this is italic.</i></b></p>`
expected := `*This is bold, _and this is italic._*`
expected := `<b>This is bold, <i>and this is italic.</i></b>`
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
}
})

t.Run("MarkdownV2 escaping", func(t *testing.T) {
// Note: We no longer escape explicitly in ParsePassageFromHtml as we rely on the platform
// to handle it later (via PostTelegram).
// However, returning raw characters like * might cause issues if not handled by platform.
// For now, we expect them to be returned raw.
html := `<p>This has special characters: *_. [hello](world)!</p>`
expected := `This has special characters: *_. [hello](world)!`
t.Run("Lists", func(t *testing.T) {
html := `<ul><li>Item 1</li><li>Item 2</li></ul>`
// Note: ParseNodesForPassage prefixes each list item with "• " and appends a
// newline after it.
// strings.TrimSpace then removes the trailing newline:
// "• Item 1\n• Item 2\n" -> "• Item 1\n• Item 2"
expected := "• Item 1\n• Item 2"
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected)
t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
}
})

t.Run("Headers", func(t *testing.T) {
html := `<h1>Header</h1>`
// ParseNodesForPassage renders a header as "\n\n<b>Header</b>\n".
// strings.TrimSpace strips the surrounding newlines, leaving "<b>Header</b>".
expected := "<b>Header</b>"
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
}
})

t.Run("Divs and escaping", func(t *testing.T) {
html := `<div>Text &lt;with&gt; symbols</div>`
expected := "Text &lt;with&gt; symbols"
if got := ParsePassageFromHtml("", html, ""); got != expected {
t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected)
}
})
}
Expand Down
Loading