diff --git a/pkg/app/passage.go b/pkg/app/passage.go index 5a5a7d3..2c610bc 100644 --- a/pkg/app/passage.go +++ b/pkg/app/passage.go @@ -8,6 +8,7 @@ import ( "log" "net/url" "strings" + stdhtml "html" "golang.org/x/net/html" @@ -59,20 +60,30 @@ func isNextSiblingBr(node *html.Node) bool { } func ParseNodesForPassage(node *html.Node) string { - var text string var parts []string for child := node.FirstChild; child != nil; child = child.NextSibling { - parts = append(parts, text) + // Filter out footnotes sections/cross-refs if they appear as divs + if child.Type == html.ElementNode { + for _, attr := range child.Attr { + if attr.Key == "class" { + if strings.Contains(attr.Val, "footnotes") || strings.Contains(attr.Val, "cross-refs") { + continue + } + } + } + } switch tag := child.Data; tag { case "span": + // Keep existing logic for span (likely poetry lines in legacy/scraped HTML) childText := ParseNodesForPassage(child) parts = append(parts, childText) if len(strings.TrimSpace(childText)) > 0 && !isNextSiblingBr(child) { parts = append(parts, "\n") } case "sup": + // Handle superscripts (verse numbers/footnotes) isFootnote := func(node *html.Node) bool { for _, attr := range node.Attr { if attr.Key == "class" && attr.Val == "footnote" { @@ -85,67 +96,62 @@ func ParseNodesForPassage(node *html.Node) string { break } childText := ParseNodesForPassage(child) + // Use TelegramSuperscript for unicode conversion if len(childText) > 0 { - parts = append(parts, fmt.Sprintf("^%s^", childText)) + parts = append(parts, platform.TelegramSuperscript(childText)) } break case "p": parts = append(parts, ParseNodesForPassage(child)) - break - case "b": - parts = append(parts, platform.TelegramBold(ParseNodesForPassage(child))) - case "i": - parts = append(parts, platform.TelegramItalics(ParseNodesForPassage(child))) - break + parts = append(parts, "\n\n") + case "b", "strong": + parts = append(parts, fmt.Sprintf("%s", ParseNodesForPassage(child))) + case "i", "em": + parts = append(parts, fmt.Sprintf("%s", ParseNodesForPassage(child))) + case "h1", "h2", "h3", "h4", "h5", "h6": + // Ignore "Footnotes" or "Cross references" headers + headerText := ParseNodesForPassage(child) + if headerText == "Footnotes" || headerText == "Cross references" { + continue + } + parts = append(parts, fmt.Sprintf("\n\n%s\n", headerText)) + case "ul", "ol": + parts = append(parts, ParseNodesForPassage(child)) + case "li": + parts = append(parts, fmt.Sprintf("• %s\n", ParseNodesForPassage(child))) case "br": parts = append(parts, "\n") - break + case "div": + parts = append(parts, ParseNodesForPassage(child)) default: - parts = append(parts, child.Data) + if child.Type == html.TextNode { + parts = append(parts, stdhtml.EscapeString(child.Data)) + } else if child.Type == html.ElementNode { + // Recurse for unknown elements to preserve content + parts = append(parts, ParseNodesForPassage(child)) + } } } - text = strings.Join(parts, "") - - if node.Data == "h1" || node.Data == "h2" || node.Data == "h3" || node.Data == "h4" { - text = fmt.Sprintf("*%s*", text) - } - return text + return strings.Join(parts, "") } func GetPassage(ref string, doc *html.Node, version string) string { - filtNodes := utils.FilterTree(doc, func(child *html.Node) bool { - switch tag := child.Data; tag { - case "h1": - fallthrough - case "h2": - fallthrough - case "h3": - fallthrough - case "h4": - if child.FirstChild.Data == "Footnotes" || child.FirstChild.Data == "Cross references" { - return false - } - fallthrough - case "p": - return true - } - return false - }) + // Replaced FilterTree with direct parsing of the root node + // This allows handling arbitrary structure (divs, lists) returned by the API - textBlocks := utils.MapNodeListToString(filtNodes, ParseNodesForPassage) + text := ParseNodesForPassage(doc) var passage strings.Builder if len(ref) > 0 { - refString := fmt.Sprintf("_%s_ (%s)", ref, version) + // Use HTML formatting for reference + refString := fmt.Sprintf("%s (%s)", ref, version) passage.WriteString(refString) } - for _, block := range textBlocks { - passage.WriteString("\n") - passage.WriteString(block) - } + passage.WriteString("\n") + passage.WriteString(strings.TrimSpace(text)) return passage.String() } @@ -158,6 +164,11 @@ func ParsePassageFromHtml(ref string, rawHtml string, version string) string { return rawHtml } + // html.Parse returns a doc with html->body structure. + // GetPassage -> ParseNodesForPassage will traverse it. + // We might want to find 'body' to avoid processing 'head'? + // ParseNodesForPassage iterates children. doc->html->body. + // We can let it recurse. return strings.TrimSpace(GetPassage(ref, doc, version)) } @@ -181,6 +192,7 @@ func GetBiblePassageFallback(env def.SessionData) def.SessionData { // Attempt to get the passage env.Res.Message = GetPassage(ref, passageNode, config.Version) + env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML return env } @@ -224,6 +236,7 @@ func GetBiblePassage(env def.SessionData) def.SessionData { if len(resp.Verse) > 0 { env.Res.Message = ParsePassageFromHtml(env.Msg.Message, resp.Verse, config.Version) + env.Res.ParseMode = def.TELEGRAM_PARSE_MODE_HTML return env } } diff --git a/pkg/app/passage_test.go b/pkg/app/passage_test.go index e1dbc6b..29fdaeb 100644 --- a/pkg/app/passage_test.go +++ b/pkg/app/passage_test.go @@ -112,6 +112,10 @@ func TestGetBiblePassage(t *testing.T) { if len(env.Res.Message) < 10 { t.Errorf("Expected passage text, got '%s'", env.Res.Message) } + // Verify ParseMode is set + if env.Res.ParseMode != "HTML" { + t.Errorf("Expected ParseMode 'HTML', got '%s'", env.Res.ParseMode) + } }) t.Run("Empty", func(t *testing.T) { @@ -166,21 +170,26 @@ func TestGetBiblePassage(t *testing.T) { if !strings.Contains(env.Res.Message, "In the beginning") { t.Errorf("Expected fallback passage content, got '%s'", env.Res.Message) } + // Fallback should also use HTML mode + if env.Res.ParseMode != "HTML" { + t.Errorf("Expected ParseMode 'HTML' in fallback, got '%s'", env.Res.ParseMode) + } }) } func TestParsePassageFromHtml(t *testing.T) { t.Run("Valid HTML with superscript", func(t *testing.T) { html := `

12 But to all who did receive him, who believed in his name, he gave the right to become children of God,

` - expected := `^12 ^But to all who did receive him, who believed in his name, he gave the right to become children of God,` + // Updated expectation: unicode superscripts and HTML formatting + expected := `¹²But to all who did receive him, who believed in his name, he gave the right to become children of God,` if got := ParsePassageFromHtml("", html, ""); got != expected { - t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) + t.Errorf("ParsePassageFromHtml() = %s, want %s", got, expected) } }) t.Run("HTML with italics", func(t *testing.T) { html := `

This is italic.

` - expected := `_This is italic._` + expected := `This is italic.` if got := ParsePassageFromHtml("", html, ""); got != expected { t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) } @@ -188,7 +197,7 @@ func TestParsePassageFromHtml(t *testing.T) { t.Run("HTML with bold", func(t *testing.T) { html := `

This is bold.

` - expected := `*This is bold.*` + expected := `This is bold.` if got := ParsePassageFromHtml("", html, ""); got != expected { t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) } @@ -228,21 +237,38 @@ func TestParsePassageFromHtml(t *testing.T) { t.Run("Nested HTML tags", func(t *testing.T) { html := `

This is bold, and this is italic.

` - expected := `*This is bold, _and this is italic._*` + expected := `This is bold, and this is italic.` if got := ParsePassageFromHtml("", html, ""); got != expected { t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) } }) - t.Run("MarkdownV2 escaping", func(t *testing.T) { - // Note: We no longer escape explicitly in ParsePassageFromHtml as we rely on the platform - // to handle it later (via PostTelegram). - // However, returning raw characters like * might cause issues if not handled by platform. - // For now, we expect them to be returned raw. - html := `

This has special characters: *_. [hello](world)!

` - expected := `This has special characters: *_. [hello](world)!` + t.Run("Lists", func(t *testing.T) { + html := `` + // Note: The ParseNodesForPassage appends newline after each Item. + // strings.TrimSpace removes the last newline. + // Item 1\nItem 2\n -> Item 1\nItem 2 + expected := "• Item 1\n• Item 2" if got := ParsePassageFromHtml("", html, ""); got != expected { - t.Errorf("ParsePassageFromHtml() = %v, want %v", got, expected) + t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected) + } + }) + + t.Run("Headers", func(t *testing.T) { + html := `

Header

` + // Code: \n\nHeader\n + // TrimSpace -> Header + expected := "Header" + if got := ParsePassageFromHtml("", html, ""); got != expected { + t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected) + } + }) + + t.Run("Divs and escaping", func(t *testing.T) { + html := `
Text <with> symbols
` + expected := "Text <with> symbols" + if got := ParsePassageFromHtml("", html, ""); got != expected { + t.Errorf("ParsePassageFromHtml() = %q, want %q", got, expected) } }) }