Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add basic toc generation for pandoc #8911

Open
wants to merge 10 commits into
base: master
Choose a base branch
from
3 changes: 3 additions & 0 deletions markup/markup_config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import (
"github.com/gohugoio/hugo/markup/asciidocext/asciidocext_config"
"github.com/gohugoio/hugo/markup/goldmark/goldmark_config"
"github.com/gohugoio/hugo/markup/highlight"
"github.com/gohugoio/hugo/markup/pandoc/pandoc_config"
"github.com/gohugoio/hugo/markup/tableofcontents"
"github.com/mitchellh/mapstructure"
)
Expand All @@ -39,6 +40,7 @@ type Config struct {

// Configuration for the Asciidoc external markdown engine.
AsciidocExt asciidocext_config.Config
Pandoc pandoc_config.Config
}

func Decode(cfg config.Provider) (conf Config, err error) {
Expand Down Expand Up @@ -105,4 +107,5 @@ var Default = Config{

Goldmark: goldmark_config.Default,
AsciidocExt: asciidocext_config.Default,
Pandoc: pandoc_config.Default,
}
184 changes: 181 additions & 3 deletions markup/pandoc/convert.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,16 @@
package pandoc

import (
"bytes"

"github.com/gohugoio/hugo/common/hexec"
"github.com/gohugoio/hugo/htesting"
"github.com/gohugoio/hugo/identity"
"golang.org/x/net/html"

"github.com/gohugoio/hugo/markup/converter"
"github.com/gohugoio/hugo/markup/internal"
"github.com/gohugoio/hugo/markup/tableofcontents"
)

// Provider is the package entry point.
Expand All @@ -38,17 +42,33 @@ func (p provider) New(cfg converter.ProviderConfig) (converter.Provider, error)
}), nil
}

// pandocResult is the render result returned by pandocConverter.Convert: the
// rendered content together with the table of contents extracted from it.
type pandocResult struct {
// ResultRender carries the rendered page content.
converter.ResultRender
// toc is the value handed out by TableOfContents; Convert fills it with
// whatever extractTOC returned.
toc *tableofcontents.Fragments
}

// TableOfContents returns the table of contents stored in the result. The
// returned pointer is the toc field as-is, so it is nil when the result was
// built without one.
func (r pandocResult) TableOfContents() *tableofcontents.Fragments {
return r.toc
}

// pandocConverter renders content by calling out to the pandoc binary.
type pandocConverter struct {
// ctx describes the document being converted; it is forwarded to the
// external render call.
ctx converter.DocumentContext
// cfg is the provider configuration; it is forwarded to the external render
// call and supplies the markup configuration (Pandoc.PreserveTOC).
cfg converter.ProviderConfig
}

// Convert renders ctx.Src with pandoc and separates the generated table of
// contents from the page content.
//
// The returned result is a pandocResult: its ResultRender holds the content as
// returned by extractTOC and its TableOfContents method returns the extracted
// TOC. Any error from the pandoc call or from the TOC extraction is returned
// unchanged with a nil result.
//
// Pandoc is invoked exactly once; the earlier implementation's extra call and
// early return (which made the TOC extraction unreachable) are gone.
//
// NOTE(review): getPandocContent returns the source unrendered when no pandoc
// binary is found; that raw source is still fed through extractTOC's HTML
// parse/render round trip — confirm that this is acceptable.
func (c *pandocConverter) Convert(ctx converter.RenderContext) (converter.ResultRender, error) {
	contentWithToc, err := c.getPandocContent(ctx.Src, c.ctx)
	if err != nil {
		return nil, err
	}

	content, toc, err := c.extractTOC(contentWithToc)
	if err != nil {
		return nil, err
	}

	return pandocResult{
		ResultRender: converter.Bytes(content),
		toc:          toc,
	}, nil
}

func (c *pandocConverter) Supports(feature identity.Identity) bool {
Expand All @@ -64,7 +84,7 @@ func (c *pandocConverter) getPandocContent(src []byte, ctx converter.DocumentCon
" Leaving pandoc content unrendered.")
return src, nil
}
args := []string{"--mathjax"}
args := []string{"--mathjax", "--toc", "-s", "--metadata", "title=dummy"}
pagdot marked this conversation as resolved.
Show resolved Hide resolved
return internal.ExternallyRenderContent(c.cfg, ctx, src, binaryName, args)
}

Expand All @@ -77,6 +97,164 @@ func getPandocBinaryName() string {
return ""
}

// extractTOC extracts the table of contents from the standalone HTML document
// in src.
//
// It returns the inner HTML of the document's <body>, with surrounding
// whitespace trimmed, and the parsed TOC:
//
//   - the <header id="title-block-header"> element, if present, is removed;
//   - the <nav id="TOC"> element, if present, is parsed with parseTOC and then
//     removed, unless the Pandoc.PreserveTOC markup setting is true;
//   - the returned TOC is nil when no such <nav> exists.
//
// If the parsed document has no <body> element there is nothing to extract and
// src is returned unchanged with a nil TOC. An error is returned when src
// cannot be parsed or the body cannot be rendered.
func (c *pandocConverter) extractTOC(src []byte) ([]byte, *tableofcontents.Fragments, error) {
	doc, err := html.Parse(bytes.NewReader(src))
	if err != nil {
		return nil, nil, err
	}

	body := findHTMLElement(doc, func(n *html.Node) bool {
		return n.Data == "body"
	})
	if body == nil {
		return src, nil, nil
	}

	// Drop the title block; it is a strict descendant of body, so Parent is
	// never nil here.
	titleBlock := findHTMLElement(body, func(n *html.Node) bool {
		return n.Data == "header" && attr(n, "id") == "title-block-header"
	})
	if titleBlock != nil {
		titleBlock.Parent.RemoveChild(titleBlock)
	}

	var toc *tableofcontents.Fragments
	nav := findHTMLElement(body, func(n *html.Node) bool {
		return n.Data == "nav" && attr(n, "id") == "TOC"
	})
	if nav != nil {
		toc = parseTOC(nav)
		if !c.cfg.MarkupConfig().Pandoc.PreserveTOC {
			nav.Parent.RemoveChild(nav)
		}
	}

	// Render the children of body rather than body itself so that no
	// <body>...</body> wrapper has to be cut off with fixed byte offsets, which
	// would panic on short output and could eat real content.
	var buf bytes.Buffer
	for child := body.FirstChild; child != nil; child = child.NextSibling {
		if err := html.Render(&buf, child); err != nil {
			return nil, nil, err
		}
	}
	return bytes.TrimSpace(buf.Bytes()), toc, nil
}

// findHTMLElement returns the first element node in the subtree rooted at root,
// in document order, for which match reports true, or nil if there is none.
// match is only called for element nodes.
func findHTMLElement(root *html.Node, match func(*html.Node) bool) *html.Node {
	if root == nil {
		return nil
	}
	if root.Type == html.ElementNode && match(root) {
		return root
	}
	for child := root.FirstChild; child != nil; child = child.NextSibling {
		if found := findHTMLElement(child, match); found != nil {
			return found
		}
	}
	return nil
}

// parseTOC builds the table of contents from the children of the given TOC
// node (extractTOC passes the <nav id="TOC"> element).
//
// Only <ul> and <li> elements are descended into. Every <a> that is a direct
// child of an <li> becomes a heading whose Title is the anchor's inner HTML and
// whose ID is the anchor's href without a leading '#'. Headings are added at
// (row, level): row starts at -1 and is incremented when a <ul> is met at level
// 0; level is incremented for every <ul>. Both values, once changed by a <ul>,
// also apply to the siblings that follow that <ul>.
//
// Empty lists, an empty TOC node and anchors without an href are tolerated
// instead of causing a panic.
func parseTOC(doc *html.Node) *tableofcontents.Fragments {
	var (
		toc  tableofcontents.Builder
		walk func(n *html.Node, row, level int)
	)
	walk = func(n *html.Node, row, level int) {
		// Iterate over the sibling chain; only descending into children recurses.
		for ; n != nil; n = n.NextSibling {
			if n.Type != html.ElementNode {
				continue
			}
			switch n.Data {
			case "ul":
				if level == 0 {
					row++
				}
				level++
				walk(n.FirstChild, row, level)
			case "li":
				for c := n.FirstChild; c != nil; c = c.NextSibling {
					if c.Type != html.ElementNode || c.Data != "a" {
						continue
					}
					// Strip the fragment marker only when it is there; an empty or
					// missing href yields an empty ID.
					id := attr(c, "href")
					if len(id) > 0 && id[0] == '#' {
						id = id[1:]
					}
					toc.AddAt(&tableofcontents.Heading{
						Title: nodeContent(c),
						ID:    id,
					}, row, level)
				}
				walk(n.FirstChild, row, level)
			}
		}
	}
	walk(doc.FirstChild, -1, 0)
	return toc.Build()
}

// attr looks up the attribute named key on node and returns the value of the
// first match. It returns the empty string when node has no such attribute.
func attr(node *html.Node, key string) string {
	for i := range node.Attr {
		if candidate := node.Attr[i]; candidate.Key == key {
			return candidate.Val
		}
	}
	return ""
}

// nodeContent returns the inner HTML of node: each child is rendered in order
// and the results are concatenated. Render errors are ignored, so a child that
// fails to render contributes whatever had been written before the failure.
func nodeContent(node *html.Node) string {
	var inner bytes.Buffer
	child := node.FirstChild
	for child != nil {
		_ = html.Render(&inner, child)
		child = child.NextSibling
	}
	return inner.String()
}

// Supports returns whether Pandoc is installed on this computer.
func Supports() bool {
hasBin := getPandocBinaryName() != ""
Expand Down
85 changes: 85 additions & 0 deletions markup/pandoc/integration_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
// Copyright 2021 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pandoc_test

import (
"testing"

"github.com/gohugoio/hugo/hugolib"
)

// TestBasicConversion builds a one-page site whose content is the single word
// "testContent" and asserts on public/p1/index.html that it was published
// inside a paragraph element.
//
// NOTE(review): the page is content/p1.md and has no front matter; presumably
// that selects Hugo's default Markdown renderer rather than pandoc — confirm
// that this test reaches the pandoc converter at all.
func TestBasicConversion(t *testing.T) {
	t.Parallel()

	const site = `
-- config.toml --
-- content/p1.md --
testContent
-- layouts/_default/single.html --
{{ .Content }}
`
	cfg := hugolib.IntegrationTestConfig{
		T:           t,
		TxtarString: site,
		NeedsOsFS:   true,
	}
	builder := hugolib.NewIntegrationTestBuilder(cfg).Build()

	builder.AssertFileContent("public/p1/index.html", `<p>testContent</p>`)
}

// TestConversionWithHeader builds a one-page site whose content is a single
// level-one heading and asserts on public/p1/index.html that the heading was
// published as an <h1> with the id "testcontent".
//
// NOTE(review): the page is content/p1.md and has no front matter; presumably
// that selects Hugo's default Markdown renderer rather than pandoc — confirm
// that this test reaches the pandoc converter at all.
func TestConversionWithHeader(t *testing.T) {
	t.Parallel()

	const site = `
-- config.toml --
-- content/p1.md --
# testContent
-- layouts/_default/single.html --
{{ .Content }}
`
	cfg := hugolib.IntegrationTestConfig{
		T:           t,
		TxtarString: site,
		NeedsOsFS:   true,
	}
	builder := hugolib.NewIntegrationTestBuilder(cfg).Build()

	builder.AssertFileContent("public/p1/index.html", `<h1 id="testcontent">testContent</h1>`)
}

// TestConversionWithExtractedToc builds a one-page site with a level-one and a
// level-two heading, renders .TableOfContents followed by .Content, and asserts
// on public/p1/index.html that the table of contents lists "title 2" and that
// both headings are published with their ids.
//
// NOTE(review): the page is content/p1.md and has no front matter; presumably
// that selects Hugo's default Markdown renderer rather than pandoc — confirm
// that this test reaches the pandoc TOC extraction at all.
func TestConversionWithExtractedToc(t *testing.T) {
	t.Parallel()

	const site = `
-- config.toml --
-- content/p1.md --
# title 1
## title 2
-- layouts/_default/single.html --
{{ .TableOfContents }}
{{ .Content }}
`
	cfg := hugolib.IntegrationTestConfig{
		T:           t,
		TxtarString: site,
		NeedsOsFS:   true,
	}
	builder := hugolib.NewIntegrationTestBuilder(cfg).Build()

	const want = "<nav id=\"TableOfContents\">\n <ul>\n <li><a href=\"#title-2\">title 2</a></li>\n </ul>\n</nav>\n<h1 id=\"title-1\">title 1</h1>\n<h2 id=\"title-2\">title 2</h2>"
	builder.AssertFileContent("public/p1/index.html", want)
}
27 changes: 27 additions & 0 deletions markup/pandoc/pandoc_config/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
// Copyright 2020 The Hugo Authors. All rights reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package pandoc_config holds pandoc related configuration.
package pandoc_config

// Config configures pandoc.
type Config struct {
	// PreserveTOC controls what happens to the table of contents that pandoc
	// generates inside the rendered content: the pandoc converter removes it
	// from the content unless this is true.
	PreserveTOC bool
}

// Default holds Hugo's default pandoc configuration.
var Default = Config{PreserveTOC: false}