From a25e950fa74b8cb58c976a5b44ccd6fe5a0f5c42 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sebastian=20H=C3=B6ffner?= Date: Mon, 30 May 2022 03:34:12 +0200 Subject: [PATCH] markup: add --citeproc to pandoc converter Adds the citeproc filter to the pandoc converter. There are several PRs for this feature already. However, I think simply adding `--citeproc` is the cleanest way to enable this feature, with the option to flesh it out later, e.g., in #7529. Some PRs and issues attempt adding more config options to Hugo which indirectly configure pandoc, but I think simply configuring Pandoc via Pandoc itself is simpler, as it is already possible with two YAML blocks -- one for Hugo, and one for Pandoc: --- title: This is the Hugo YAML block --- --- bibliography: assets/pandoc-yaml-block-bibliography.bib ... Document content with @citation! There are other useful options, e.g., #4800 attempts to use `nocite`, which works out of the box with this PR: --- title: This is the Hugo YAML block --- --- bibliography: assets/pandoc-yaml-block-bibliography.bib nocite: | @* ... Document content with no citations but a full bibliography: ## Bibliography Other useful options are `csl: ...` and `link-citations: true`, which set the path to a custom CSL file and create HTML links between the references and the bibliography. The following issues and PRs are related: - Add support for parsing citations and Jupyter notebooks via Pandoc and/or Goldmark extension #6101 Bundles multiple requests, this PR tackles citation parsing. - WIP: Bibliography with Pandoc #4800 Passes the frontmatter to Pandoc and still uses `--filter pandoc-citeproc` instead of `--citeproc`. - Allow configuring Pandoc #7529 That PR is much more extensive and might eventually supersede this PR, but I think --bibliography and --citeproc should be independent options (--bibliography should be optional and citeproc can always be specified). - Pandoc - allow citeproc extension to be invoked, with bibliography. 
#8610 Similar to #7529, #8610 adds a new config option to Hugo. I think passing --citeproc and letting the users decide on the metadata they want to pass to pandoc is better, albeit uglier. --- docs/content/en/content-management/formats.md | 62 +++++++- markup/pandoc/convert.go | 74 ++++++++- markup/pandoc/convert_test.go | 142 +++++++++++++++++- markup/pandoc/testdata/bibliography.bib | 6 + 4 files changed, 273 insertions(+), 11 deletions(-) create mode 100644 markup/pandoc/testdata/bibliography.bib diff --git a/docs/content/en/content-management/formats.md b/docs/content/en/content-management/formats.md index 303bb4596c9..32870d5b2d2 100644 --- a/docs/content/en/content-management/formats.md +++ b/docs/content/en/content-management/formats.md @@ -47,7 +47,7 @@ Hugo passes reasonable default arguments to these external helpers by default: - `asciidoctor`: `--no-header-footer -` - `rst2html`: `--leave-comments --initial-header-level=2` -- `pandoc`: `--mathjax` +- `pandoc`: `--mathjax` and, for pandoc >= 2.11, `--citeproc` {{% warning "Performance of External Helpers" %}} Because additional formats are external commands, generation performance will rely heavily on the performance of the external tool you are using. As this feature is still in its infancy, feedback is welcome. @@ -55,13 +55,13 @@ Because additional formats are external commands, generation performance will re ### External Helper AsciiDoc -[AsciiDoc](https://github.com/asciidoc/asciidoc) implementation EOLs in Jan 2020 and is no longer supported. -AsciiDoc development is being continued under [Asciidoctor](https://github.com/asciidoctor). The format AsciiDoc +[AsciiDoc](https://github.com/asciidoc/asciidoc) implementation EOLs in Jan 2020 and is no longer supported. +AsciiDoc development is being continued under [Asciidoctor](https://github.com/asciidoctor). The format AsciiDoc remains of course. Please continue with the implementation Asciidoctor. 
### External Helper Asciidoctor -The Asciidoctor community offers a wide set of tools for the AsciiDoc format that can be installed additionally to Hugo. +The Asciidoctor community offers a wide set of tools for the AsciiDoc format that can be installed additionally to Hugo. [See the Asciidoctor docs for installation instructions](https://asciidoctor.org/docs/install-toolchain/). Make sure that also all optional extensions like `asciidoctor-diagram` or `asciidoctor-html5s` are installed if required. @@ -109,13 +109,65 @@ Example of how to set extensions and attributes: my-attribute-name = "my value" ``` -In a complex Asciidoctor environment it is sometimes helpful to debug the exact call to your external helper with all +In a complex Asciidoctor environment it is sometimes helpful to debug the exact call to your external helper with all parameters. Run Hugo with `-v`. You will get an output like ``` INFO 2019/12/22 09:08:48 Rendering book-as-pdf.adoc with C:\Ruby26-x64\bin\asciidoctor.bat using asciidoc args [--no-header-footer -r asciidoctor-html5s -b html5s -r asciidoctor-diagram --base-dir D:\prototypes\hugo_asciidoc_ddd\docs -a outdir=D:\prototypes\hugo_asciidoc_ddd\build -] ... ``` +### External Helper Pandoc + +[Pandoc](https://pandoc.org) is a universal document converter and can be used to convert markdown files. +In Hugo, Pandoc can be used for LaTeX-style math (the `--mathjax` command line option is provided): + +``` +--- +title: Math document +--- + +Some inline math: $a^2 + b^2 = c^2$. +``` + +This will render in your HTML as: + +``` +

Some inline math: \(a^2 + b^2 = c^2\)

+``` +You will have to [add MathJax](https://www.mathjax.org/#gettingstarted) to your template to properly render the math. + +For **Pandoc >= 2.11**, you can use [citations](https://pandoc.org/MANUAL.html#extension-citations). +One way is to employ [BibTeX files](https://en.wikibooks.org/wiki/LaTeX/Bibliography_Management#BibTeX) to cite: + +``` +--- +title: Citation document +--- +--- +bibliography: assets/bibliography.bib +... +This is a citation: @Doe2022 +``` + +Note that Hugo will **not** pass its metadata YAML block to Pandoc; however, it will pass the **second** metadata block, denoted with `---` and `...`, to Pandoc. +Thus, all Pandoc settings should go there. + +You can also add all elements from a bibliography file (without citing them explicitly) using: + +``` +--- +title: My Publications +--- +--- +bibliography: assets/bibliography.bib +nocite: | + @* +... +``` + +It is also possible to provide a custom [CSL style](https://citationstyles.org/authors/) by passing `csl: path-to-style.csl` as a Pandoc option. + + ## Learn Markdown Markdown syntax is simple enough to learn in a single sitting. The following are excellent resources to get you up and running: diff --git a/markup/pandoc/convert.go b/markup/pandoc/convert.go index ae90cf41770..79614116422 100644 --- a/markup/pandoc/convert.go +++ b/markup/pandoc/convert.go @@ -15,12 +15,16 @@ package pandoc import ( + "bytes" + "strconv" + "strings" + "sync" + "github.com/gohugoio/hugo/common/hexec" "github.com/gohugoio/hugo/htesting" "github.com/gohugoio/hugo/identity" - "github.com/gohugoio/hugo/markup/internal" - "github.com/gohugoio/hugo/markup/converter" + "github.com/gohugoio/hugo/markup/internal" ) // Provider is the package entry point. 
@@ -65,6 +69,9 @@ func (c *pandocConverter) getPandocContent(src []byte, ctx converter.DocumentCon return src, nil } args := []string{"--mathjax"} + if supportsCitations(c.cfg) { + args = append(args[:], "--citeproc") + } return internal.ExternallyRenderContent(c.cfg, ctx, src, binaryName, args) } @@ -77,6 +84,69 @@ func getPandocBinaryName() string { return "" } +type pandocVersion struct { + major, minor int64 +} + +func (left pandocVersion) greaterThanOrEqual(right pandocVersion) bool { + return left.major > right.major || (left.major == right.major && left.minor >= right.minor) +} + +var versionOnce sync.Once +var foundPandocVersion pandocVersion + +// getPandocVersion parses the pandoc version output +func getPandocVersion(cfg converter.ProviderConfig) (pandocVersion, error) { + var err error + + versionOnce.Do(func() { + argsv := []any{"--version"} + + var out bytes.Buffer + argsv = append(argsv, hexec.WithStdout(&out)) + + cmd, err := cfg.Exec.New(pandocBinary, argsv...) + if err != nil { + cfg.Logger.Errorf("Could not call pandoc: %v", err) + foundPandocVersion = pandocVersion{0, 0} + return + } + + err = cmd.Run() + if err != nil { + cfg.Logger.Errorf("%s --version: %v", pandocBinary, err) + foundPandocVersion = pandocVersion{0, 0} + return + } + + outbytes := bytes.Replace(out.Bytes(), []byte("\r"), []byte(""), -1) + output := strings.Split(string(outbytes), "\n")[0] + // Split, e.g., "pandoc 2.5" into 2 and 5 and convert them to integers + versionStrings := strings.Split(strings.Split(output, " ")[1], ".") + majorVersion, err := strconv.ParseInt(versionStrings[0], 10, 64) + if err != nil { + println(err) + } + minorVersion, err := strconv.ParseInt(versionStrings[1], 10, 64) + if err != nil { + println(err) + } + foundPandocVersion = pandocVersion{majorVersion, minorVersion} + }) + + return foundPandocVersion, err +} + +// SupportsCitations returns true for pandoc versions >= 2.11, which include citeproc +func supportsCitations(cfg 
converter.ProviderConfig) bool { + if Supports() { + foundPandocVersion, err := getPandocVersion(cfg) + supportsCitations := foundPandocVersion.greaterThanOrEqual(pandocVersion{2, 11}) && err == nil + return supportsCitations + } + return false +} + // Supports returns whether Pandoc is installed on this computer. func Supports() bool { hasBin := getPandocBinaryName() != "" diff --git a/markup/pandoc/convert_test.go b/markup/pandoc/convert_test.go index f549d5f4ff8..aa362552091 100644 --- a/markup/pandoc/convert_test.go +++ b/markup/pandoc/convert_test.go @@ -25,18 +25,152 @@ import ( qt "github.com/frankban/quicktest" ) -func TestConvert(t *testing.T) { +func setupTestConverter(t *testing.T) (*qt.C, converter.Converter, converter.ProviderConfig) { if !Supports() { t.Skip("pandoc not installed") } c := qt.New(t) sc := security.DefaultConfig sc.Exec.Allow = security.NewWhitelist("pandoc") - p, err := Provider.New(converter.ProviderConfig{Exec: hexec.New(sc), Logger: loggers.NewErrorLogger()}) + cfg := converter.ProviderConfig{Exec: hexec.New(sc), Logger: loggers.NewErrorLogger()} + p, err := Provider.New(cfg) c.Assert(err, qt.IsNil) conv, err := p.New(converter.DocumentContext{}) c.Assert(err, qt.IsNil) - b, err := conv.Convert(converter.RenderContext{Src: []byte("testContent")}) + return c, conv, cfg +} + +func TestConvert(t *testing.T) { + c, conv, _ := setupTestConverter(t) + output, err := conv.Convert(converter.RenderContext{Src: []byte("testContent")}) + c.Assert(err, qt.IsNil) + c.Assert(string(output.Bytes()), qt.Equals, "

testContent

\n") +} + +func runCiteprocTest(t *testing.T, content string, expected string) { + c, conv, cfg := setupTestConverter(t) + if !supportsCitations(cfg) { + t.Skip("pandoc does not support citations") + } + output, err := conv.Convert(converter.RenderContext{Src: []byte(content)}) c.Assert(err, qt.IsNil) - c.Assert(string(b.Bytes()), qt.Equals, "

testContent

\n") + c.Assert(string(output.Bytes()), qt.Equals, expected) +} + +func TestGetPandocVersionCallTwice(t *testing.T) { + c, _, cfg := setupTestConverter(t) + + version1, err1 := getPandocVersion(cfg) + version2, err2 := getPandocVersion(cfg) + c.Assert(version1, qt.Equals, version2) + c.Assert(err1, qt.IsNil) + c.Assert(err2, qt.IsNil) +} + +func TestPandocVersionEquality(t *testing.T) { + c := qt.New(t) + v1 := pandocVersion{1, 0} + v2 := pandocVersion{2, 0} + v3 := pandocVersion{2, 2} + v4 := pandocVersion{1, 2} + v5 := pandocVersion{2, 11} + + // 1 >= 1 -> true + c.Assert(v1.greaterThanOrEqual(v1), qt.IsTrue) + + // 1 >= 2 -> false, 2 >= 1 -> tru + c.Assert(v1.greaterThanOrEqual(v2), qt.IsFalse) + c.Assert(v2.greaterThanOrEqual(v1), qt.IsTrue) + + // 2.0 >= 2.2 -> false, 2.2 >= 2.0 -> true + c.Assert(v2.greaterThanOrEqual(v3), qt.IsFalse) + c.Assert(v3.greaterThanOrEqual(v2), qt.IsTrue) + + // 2.2 >= 1.2 -> true, 1.2 >= 2.2 -> false + c.Assert(v3.greaterThanOrEqual(v4), qt.IsTrue) + c.Assert(v4.greaterThanOrEqual(v3), qt.IsFalse) + + // 2.11 >= 2.2 -> true, 2.2 >= 2.11 -> false + c.Assert(v5.greaterThanOrEqual(v3), qt.IsTrue) + c.Assert(v3.greaterThanOrEqual(v5), qt.IsFalse) +} + +func TestCiteprocWithHugoMeta(t *testing.T) { + content := ` +--- +title: Test +published: 2022-05-30 +--- +testContent +` + expected := "

testContent

\n" + runCiteprocTest(t, content, expected) +} + +func TestCiteprocWithPandocMeta(t *testing.T) { + content := ` +--- +--- +--- +... +testContent +` + expected := "

testContent

\n" + runCiteprocTest(t, content, expected) +} + +func TestCiteprocWithBibliography(t *testing.T) { + content := ` +--- +--- +--- +bibliography: testdata/bibliography.bib +... +testContent +` + expected := "

testContent

\n" + runCiteprocTest(t, content, expected) +} + +func TestCiteprocWithExplicitCitation(t *testing.T) { + content := ` +--- +--- +--- +bibliography: testdata/bibliography.bib +... +@Doe2022 +` + expected := `

Doe and Mustermann +(2022)

+
+
+Doe, Jane, and Max Mustermann. 2022. “A Treatise on Hugo +Tests.” Hugo Websites. +
+
+` + runCiteprocTest(t, content, expected) +} + +func TestCiteprocWithNocite(t *testing.T) { + content := ` +--- +--- +--- +bibliography: testdata/bibliography.bib +nocite: | + @* +... +` + expected := `
+
+Doe, Jane, and Max Mustermann. 2022. “A Treatise on Hugo +Tests.” Hugo Websites. +
+
+` + runCiteprocTest(t, content, expected) } diff --git a/markup/pandoc/testdata/bibliography.bib b/markup/pandoc/testdata/bibliography.bib new file mode 100644 index 00000000000..8fc1019b435 --- /dev/null +++ b/markup/pandoc/testdata/bibliography.bib @@ -0,0 +1,6 @@ +@article{Doe2022, + author = "Jane Doe and Max Mustermann", + title = "A Treatise on Hugo Tests", + journal = "Hugo Websites", + year = "2022", +}