From 56bd0c65c48ff0efdd884e731fd0fa5b04c5beef Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Sun, 12 Mar 2017 17:19:49 +0700 Subject: [PATCH 01/18] Update the purpose of the fork --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 902a7e1..9742584 100644 --- a/README.md +++ b/README.md @@ -1,3 +1,5 @@ -go get rsc.io/pdf +# Purpose of the fork -http://godoc.org/rsc.io/pdf +This fork of rsc.io/pdf extends the package API with: + + - Implement the method GetPlainText() from object Page. Use to get plain text content (without format) From 612c19099809c99bd4595ee711560b748bd0ae2a Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Sun, 12 Mar 2017 20:33:21 +0700 Subject: [PATCH 02/18] Change the import path --- pdfpasswd/main.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pdfpasswd/main.go b/pdfpasswd/main.go index 53c8ef1..57fa88f 100644 --- a/pdfpasswd/main.go +++ b/pdfpasswd/main.go @@ -12,7 +12,7 @@ import ( "log" "os" - "rsc.io/pdf" + "github.com/ledongthuc/pdf" ) var ( From b95967f4ea5d295d627d5767e70b1b4ddbd05f7d Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Sun, 12 Mar 2017 21:50:42 +0700 Subject: [PATCH 03/18] Add function to get plain text from Page --- page.go | 56 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/page.go b/page.go index 9c7d688..c7a80ac 100644 --- a/page.go +++ b/page.go @@ -5,6 +5,7 @@ package pdf import ( + "bytes" "fmt" "strings" ) @@ -401,6 +402,61 @@ type gstate struct { CTM matrix } +// GetPlainText returns the page's all text without format. +// - seperator parameter used to add chars to split part not at the same paragraphs. "\n" is good way to try. +func (p Page) GetPlainText(seperator string) string { + strm := p.V.Key("Contents") + + var textBuilder bytes.Buffer + showText := func(s string) { + _, err := textBuilder.WriteString(s) + if err != nil { + panic(err) + } + } + + Interpret(strm, func(stk *Stack, op string) { + n := stk.Len() + args := make([]Value, n) + for i := n - 1; i >= 0; i-- { + args[i] = stk.Pop() + } + + switch op { + default: + return + case "T*": // move to start of next line + showText(seperator) + case "\"": // set spacing, move to next line, and show text + if len(args) != 3 { + panic("bad \" operator") + } + fallthrough + case "'": // move to next line and show text + if len(args) != 1 { + panic("bad ' operator") + } + fallthrough + case "Tj": // show text + if len(args) != 1 { + panic("bad Tj operator") + } + showText(args[0].RawString()) + showText(seperator) + case "TJ": // show text, allowing individual glyph positioning + v := args[0] + for i := 0; i < v.Len(); i++ { + x := v.Index(i) + if x.Kind() == String { + showText(x.RawString()) + showText(seperator) + } + } + } + }) + return textBuilder.String() +} + // Content returns the page's content. func (p Page) Content() Content { strm := p.V.Key("Contents") From daace13046a1d74935e3d38767d83943d9f63155 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 15:53:10 +0700 Subject: [PATCH 04/18] Remove comment of rsc --- read.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/read.go b/read.go index eb8b9aa..f3ad3ed 100644 --- a/read.go +++ b/read.go @@ -44,7 +44,7 @@ // the package. Equally important, traversal of other PDF data structures can be implemented // in other packages as needed. // -package pdf // import "rsc.io/pdf" +package pdf // BUG(rsc): The package is incomplete, although it has been used successfully on some // large real-world PDF files. From 0e30ba212a76647ebf25d15ff1e0a50ca5f44971 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 16:58:03 +0700 Subject: [PATCH 05/18] Update --- README.md | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/README.md b/README.md index 9742584..cfa0d9c 100644 --- a/README.md +++ b/README.md @@ -3,3 +3,27 @@ This fork of rsc.io/pdf extends the package API with: - Implement the method GetPlainText() from object Page. Use to get plain text content (without format) + +## How to read all text from PDF: + +I write an example function to read file from PATH and return the content of PDF + + ```golang + func readPdf(path string) (string, error) { + r, err := pdf.Open(path) + if err != nil { + return "", err + } + totalPage := r.NumPage() + + var textBuilder bytes.Buffer + for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { + p := r.Page(pageIndex) + if p.V.IsNull() { + continue + } + textBuilder.WriteString(p.GetPlainText("\n")) + } + return textBuilder.String(), nil + } + ``` From ffbf376ba4dfa5945fd52fd1dbce9468a6f98342 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 16:58:42 +0700 Subject: [PATCH 06/18] Correct the language of code block example --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cfa0d9c..97860b7 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This fork of rsc.io/pdf extends the package API with: I write an example function to read file from PATH and return the content of PDF - ```golang + ```go func readPdf(path string) (string, error) { r, err := pdf.Open(path) if err != nil { From f8f8fe4f600c77e16df2d1121cec055de53192c7 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 16:59:16 +0700 Subject: [PATCH 07/18] Update --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 97860b7..7894a4a 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This fork of rsc.io/pdf extends the package API with: I write an example function to read file from PATH and return the content of PDF - ```go + ``` func readPdf(path string) (string, error) { r, err := pdf.Open(path) if err != nil { From d6cc51520d9495c45daa3ca4b69af29f67ce53bb Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 13 Mar 2017 17:00:37 +0700 Subject: [PATCH 08/18] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7894a4a..587e6e6 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,7 @@ This fork of rsc.io/pdf extends the package API with: I write an example function to read file from PATH and return the content of PDF - ``` +```golang func readPdf(path string) (string, error) { r, err := pdf.Open(path) if err != nil { @@ -26,4 +26,4 @@ I write an example function to read file from PATH and return the content of PDF } return textBuilder.String(), nil } - ``` +``` From f3eb144855fbc8a739170bd4f661c75e8979f5f9 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Tue, 14 Mar 2017 10:46:20 +0700 Subject: [PATCH 09/18] Update README.md --- README.md | 55 ++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index 587e6e6..167133b 100644 --- a/README.md +++ b/README.md @@ -9,21 +9,42 @@ This fork of rsc.io/pdf extends the package API with: I write an example function to read file from PATH and return the content of PDF ```golang - func readPdf(path string) (string, error) { - r, err := pdf.Open(path) - if err != nil { - return "", err - } - totalPage := r.NumPage() - - var textBuilder bytes.Buffer - for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { - p := r.Page(pageIndex) - if p.V.IsNull() { - continue - } - textBuilder.WriteString(p.GetPlainText("\n")) - } - return textBuilder.String(), nil - } +package main + +import ( + "bytes" + "fmt" + + "github.com/ledongthuc/pdf" +) + +func main() { + content, err := readPdf("test.pdf") // Read local pdf file + if err != nil { + panic(err) + } + fmt.Println(content) + return +} + +func readPdf(path string) (string, error) { + r, err := pdf.Open(path) + if err != nil { + return "", err + } + totalPage := r.NumPage() + + var textBuilder bytes.Buffer + for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { + p := r.Page(pageIndex) + if p.V.IsNull() { + continue + } + textBuilder.WriteString(p.GetPlainText("\n")) + } + return textBuilder.String(), nil +} ``` + +## Demo +![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) From 66da04eb56a952ee1e98370590103cee2c61b3ff Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Tue, 14 Mar 2017 10:51:04 +0700 Subject: [PATCH 10/18] Update README.md --- README.md | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 167133b..7916174 100644 --- a/README.md +++ b/README.md @@ -6,7 +6,10 @@ This fork of rsc.io/pdf extends the package API with: ## How to read all text from PDF: -I write an example function to read file from PATH and return the content of PDF +1. Get the library with command `go get -u github.com/ledongthuc/pdf` + + +2. I write an example function to read file from PATH and return the content of PDF ```golang package main From 4ff10c65aed6fff2bf0eda2611c97e990caa6377 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 19 Jun 2017 07:13:38 +0700 Subject: [PATCH 11/18] Add space when get text from Content() Based on pull request of https://github.com/rsc/pdf/pull/8 but never merged. So I need it :( --- page.go | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/page.go b/page.go index c7a80ac..7e34a8e 100644 --- a/page.go +++ b/page.go @@ -474,17 +474,14 @@ func (p Page) Content() Content { Trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM) w0 := g.Tf.Width(int(s[n])) n++ - if ch != ' ' { - f := g.Tf.BaseFont() - if i := strings.Index(f, "+"); i >= 0 { - f = f[i+1:] - } - text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)}) + + f := g.Tf.BaseFont() + if i := strings.Index(f, "+"); i >= 0 { + f = f[i+1:] } + text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)}) + tx := w0/1000*g.Tfs + g.Tc - if ch == ' ' { - tx += g.Tw - } tx *= g.Th g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } From 1e8ebfa8c2834dd64e93fd39717a7d8d9edeb897 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Mon, 19 Jun 2017 07:28:55 +0700 Subject: [PATCH 12/18] Add readme content to get texts with style --- README.md | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/README.md b/README.md index 7916174..b30005c 100644 --- a/README.md +++ b/README.md @@ -49,5 +49,35 @@ func readPdf(path string) (string, error) { } ``` +## How to read all text with styles from PDF + +```golang +func readPdf2(path string) (string, error) { + r, err := pdf.Open(path) + if err != nil { + return "", err + } + totalPage := r.NumPage() + + for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { + p := r.Page(pageIndex) + if p.V.IsNull() { + continue + } + var lastTextStyle pdf.Text + texts := p.Content().Text + for _, text := range texts { + if isSameSentence(text, lastTextStyle) { + lastTextStyle.S = lastTextStyle.S + text.S + } else { + fmt.Printf("Font: %s, Font-size: %f, x: %f, y: %f, content: %s \n", lastTextStyle.Font, lastTextStyle.FontSize, lastTextStyle.X, lastTextStyle.Y, lastTextStyle.S) + lastTextStyle = text + } + } + } + return "", nil +} +``` + ## Demo ![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) From fbd875511ef56a0e84d6a779ad105687e86707c7 Mon Sep 17 00:00:00 2001 From: Thuc Le Date: Sun, 2 Jul 2017 14:37:26 +0700 Subject: [PATCH 13/18] Update --- page.go | 1 + 1 file changed, 1 insertion(+) diff --git a/page.go b/page.go index 7e34a8e..7bb3d43 100644 --- a/page.go +++ b/page.go @@ -620,6 +620,7 @@ func (p Page) Content() Content { g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } } + showText("\n") case "TL": // set text leading if len(args) != 1 { From 11f580bd1d786f4d02ee0696a966dc2c687b718e Mon Sep 17 00:00:00 2001 From: Rob Archibald Date: Thu, 17 Aug 2017 19:29:14 -0700 Subject: [PATCH 14/18] Add GetPlainText to Reader. Fix Encoder method --- README.md | 29 +++++------ page.go | 152 +++++++++++++++++++++++++++++++++++++++--------------- 2 files changed, 123 insertions(+), 58 deletions(-) diff --git a/README.md b/README.md index b30005c..76f33e1 100644 --- a/README.md +++ b/README.md @@ -1,15 +1,17 @@ -# Purpose of the fork +# PDF Reader -This fork of rsc.io/pdf extends the package API with: +A simple Go library which enables reading PDF files. Forked from https://github.com/rsc/pdf - - Implement the method GetPlainText() from object Page. Use to get plain text content (without format) +Features + - Get plain text content (without format) + - Get Content (including all font and formatting information) -## How to read all text from PDF: +## Install: -1. Get the library with command `go get -u github.com/ledongthuc/pdf` +`go get -u github.com/ledongthuc/pdf` -2. I write an example function to read file from PATH and return the content of PDF +## Read plain text ```golang package main @@ -35,21 +37,14 @@ func readPdf(path string) (string, error) { if err != nil { return "", err } - totalPage := r.NumPage() - var textBuilder bytes.Buffer - for pageIndex := 1; pageIndex <= totalPage; pageIndex++ { - p := r.Page(pageIndex) - if p.V.IsNull() { - continue - } - textBuilder.WriteString(p.GetPlainText("\n")) - } - return textBuilder.String(), nil + var buf bytes.Buffer + buf.ReadFrom(p.GetPlainText()) + return buf.String(), nil } ``` -## How to read all text with styles from PDF +## Read all text with styles from PDF ```golang func readPdf2(path string) (string, error) { diff --git a/page.go b/page.go index 7bb3d43..e330bc4 100644 --- a/page.go +++ b/page.go @@ -7,6 +7,7 @@ package pdf import ( "bytes" "fmt" + "io" "strings" ) @@ -56,6 +57,24 @@ func (r *Reader) NumPage() int { return int(r.Trailer().Key("Root").Key("Pages").Key("Count").Int64()) } +// GetPlainText returns all the text in the PDF file +func (r *Reader) GetPlainText() io.Reader { + pages := r.NumPage() + var buf bytes.Buffer + fonts := make(map[string]*Font) + for i := 1; i < pages; i++ { + p := r.Page(i) + for _, name := range p.Fonts() { // cache fonts so we don't continually parse charmap + if _, ok := fonts[name]; !ok { + f := p.Font(name) + fonts[name] = &f + } + } + buf.WriteString(p.GetPlainText(fonts)) + } + return &buf +} + func (p Page) findInherited(key string) Value { for v := p.V; !v.IsNull(); v = v.Key("Parent") { if r := v.Key(key); !r.IsNull() { @@ -87,13 +106,14 @@ func (p Page) Fonts() []string { // Font returns the font with the given name associated with the page. func (p Page) Font(name string) Font { - return Font{p.Resources().Key("Font").Key(name)} + return Font{p.Resources().Key("Font").Key(name), nil} } // A Font represent a font in a PDF file. // The methods interpret a Font dictionary stored in V. type Font struct { - V Value + V Value + enc TextEncoding } // BaseFont returns the font's name (BaseFont property). @@ -134,6 +154,13 @@ func (f Font) Width(code int) float64 { // Encoder returns the encoding between font code point sequences and UTF-8. func (f Font) Encoder() TextEncoding { + if f.enc == nil { // caching the Encoder so we don't have to continually parse charmap + f.enc = f.getEncoder() + } + return f.enc +} + +func (f Font) getEncoder() TextEncoding { enc := f.V.Key("Encoding") switch enc.Kind() { case Name: @@ -143,8 +170,7 @@ func (f Font) Encoder() TextEncoding { case "MacRomanEncoding": return &byteEncoder{&macRomanEncoding} case "Identity-H": - // TODO: Should be big-endian UCS-2 decoder - return &nopEncoder{} + return f.charmapEncoding() default: println("unknown encoding", enc.Name()) return &nopEncoder{} @@ -152,14 +178,16 @@ func (f Font) Encoder() TextEncoding { case Dict: return &dictEncoder{enc.Key("Differences")} case Null: - // ok, try ToUnicode + return f.charmapEncoding() default: println("unexpected encoding", enc.String()) return &nopEncoder{} } +} +func (f *Font) charmapEncoding() TextEncoding { toUnicode := f.V.Key("ToUnicode") - if toUnicode.Kind() == Dict { + if toUnicode.Kind() == Stream { m := readCmap(toUnicode) if m == nil { return &nopEncoder{} @@ -228,42 +256,64 @@ func (e *byteEncoder) Decode(raw string) (text string) { return string(r) } +type byteRange struct { + low string + high string +} + +type bfchar struct { + orig string + repl string +} + +type bfrange struct { + lo string + hi string + dst Value +} + type cmap struct { - space [4][][2]string + space [4][]byteRange // codespace range bfrange []bfrange + bfchar []bfchar } func (m *cmap) Decode(raw string) (text string) { var r []rune Parse: for len(raw) > 0 { - for n := 1; n <= 4 && n <= len(raw); n++ { - for _, space := range m.space[n-1] { - if space[0] <= raw[:n] && raw[:n] <= space[1] { + for n := 1; n <= 4 && n <= len(raw); n++ { // number of digits in character replacement (1-4 possible) + for _, space := range m.space[n-1] { // find matching codespace Ranges for number of digits + if space.low <= raw[:n] && raw[:n] <= space.high { // see if value is in range text := raw[:n] raw = raw[n:] - for _, bf := range m.bfrange { - if len(bf.lo) == n && bf.lo <= text && text <= bf.hi { - if bf.dst.Kind() == String { - s := bf.dst.RawString() - if bf.lo != text { + for _, bfchar := range m.bfchar { // check for matching bfchar + if len(bfchar.orig) == n && bfchar.orig == text { + r = append(r, []rune(utf16Decode(bfchar.repl))...) + continue Parse + } + } + for _, bfrange := range m.bfrange { // check for matching bfrange + if len(bfrange.lo) == n && bfrange.lo <= text && text <= bfrange.hi { + if bfrange.dst.Kind() == String { + s := bfrange.dst.RawString() + if bfrange.lo != text { // value isn't at the beginning of the range so scale result b := []byte(s) - b[len(b)-1] += text[len(text)-1] - bf.lo[len(bf.lo)-1] + b[len(b)-1] += text[len(text)-1] - bfrange.lo[len(bfrange.lo)-1] // increment last byte by difference s = string(b) } r = append(r, []rune(utf16Decode(s))...) continue Parse } - if bf.dst.Kind() == Array { - fmt.Printf("array %v\n", bf.dst) + if bfrange.dst.Kind() == Array { + fmt.Printf("array %v\n", bfrange.dst) } else { - fmt.Printf("unknown dst %v\n", bf.dst) + fmt.Printf("unknown dst %v\n", bfrange.dst) } r = append(r, noRune) continue Parse } } - fmt.Printf("no text for %q", text) r = append(r, noRune) continue Parse } @@ -276,12 +326,6 @@ Parse: return string(r) } -type bfrange struct { - lo string - hi string - dst Value -} - func readCmap(toUnicode Value) *cmap { n := -1 var m cmap @@ -292,9 +336,8 @@ func readCmap(toUnicode Value) *cmap { } switch op { case "findresource": - category := stk.Pop() - key := stk.Pop() - fmt.Println("findresource", key, category) + stk.Pop() // category + stk.Pop() // key stk.Push(newDict()) case "begincmap": stk.Push(newDict()) @@ -315,9 +358,19 @@ func readCmap(toUnicode Value) *cmap { ok = false return } - m.space[len(lo)-1] = append(m.space[len(lo)-1], [2]string{lo, hi}) + m.space[len(lo)-1] = append(m.space[len(lo)-1], byteRange{lo, hi}) } n = -1 + case "beginbfchar": + n = int(stk.Pop().Int64()) + case "endbfchar": + if n < 0 { + panic("missing beginbfchar") + } + for i := 0; i < n; i++ { + repl, orig := stk.Pop().RawString(), stk.Pop().RawString() + m.bfchar = append(m.bfchar, bfchar{orig, repl}) + } case "beginbfrange": n = int(stk.Pop().Int64()) case "endbfrange": @@ -329,10 +382,9 @@ func readCmap(toUnicode Value) *cmap { m.bfrange = append(m.bfrange, bfrange{srcLo, srcHi, dst}) } case "defineresource": - category := stk.Pop().Name() + stk.Pop().Name() // category value := stk.Pop() - key := stk.Pop().Name() - fmt.Println("defineresource", key, value, category) + stk.Pop().Name() // key stk.Push(value) default: println("interp\t", op) @@ -403,15 +455,26 @@ type gstate struct { } // GetPlainText returns the page's all text without format. -// - seperator parameter used to add chars to split part not at the same paragraphs. "\n" is good way to try. -func (p Page) GetPlainText(seperator string) string { +// fonts can be passed in (to improve parsing performance) or left nil +func (p Page) GetPlainText(fonts map[string]*Font) string { strm := p.V.Key("Contents") + var enc TextEncoding = &nopEncoder{} + + if fonts == nil { + fonts = make(map[string]*Font) + for _, font := range p.Fonts() { + f := p.Font(font) + fonts[font] = &f + } + } var textBuilder bytes.Buffer showText := func(s string) { - _, err := textBuilder.WriteString(s) - if err != nil { - panic(err) + for _, ch := range enc.Decode(s) { + _, err := textBuilder.WriteRune(ch) + if err != nil { + panic(err) + } } } @@ -426,7 +489,16 @@ func (p Page) GetPlainText(seperator string) string { default: return case "T*": // move to start of next line - showText(seperator) + showText("\n") + case "Tf": // set text font and size + if len(args) != 2 { + panic("bad TL") + } + if font, ok := fonts[args[0].Name()]; ok { + enc = font.Encoder() + } else { + enc = &nopEncoder{} + } case "\"": // set spacing, move to next line, and show text if len(args) != 3 { panic("bad \" operator") @@ -442,14 +514,12 @@ func (p Page) GetPlainText(seperator string) string { panic("bad Tj operator") } showText(args[0].RawString()) - showText(seperator) case "TJ": // show text, allowing individual glyph positioning v := args[0] for i := 0; i < v.Len(); i++ { x := v.Index(i) if x.Kind() == String { showText(x.RawString()) - showText(seperator) } } } From 1d9eac6f6f570db38fb77500cac4c707108be0f4 Mon Sep 17 00:00:00 2001 From: Rob Archibald Date: Thu, 17 Aug 2017 19:46:54 -0700 Subject: [PATCH 15/18] Aligning with rsc --- README.md | 8 ++++---- page.go | 16 +++++++++------- pdfpasswd/main.go | 2 +- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index 76f33e1..58d3ce3 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # PDF Reader -A simple Go library which enables reading PDF files. Forked from https://github.com/rsc/pdf +A simple Go library which enables reading PDF files. Features - Get plain text content (without format) @@ -8,7 +8,7 @@ Features ## Install: -`go get -u github.com/ledongthuc/pdf` +`go get -u github.com/rsc/pdf` ## Read plain text @@ -20,7 +20,7 @@ import ( "bytes" "fmt" - "github.com/ledongthuc/pdf" + "github.com/rsc/pdf" ) func main() { @@ -75,4 +75,4 @@ func readPdf2(path string) (string, error) { ``` ## Demo -![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) +![Run example](https://i.gyazo.com/01fbc539e9872593e0ff6bac7e954e6d.gif) \ No newline at end of file diff --git a/page.go b/page.go index e330bc4..0280f6d 100644 --- a/page.go +++ b/page.go @@ -544,14 +544,17 @@ func (p Page) Content() Content { Trm := matrix{{g.Tfs * g.Th, 0, 0}, {0, g.Tfs, 0}, {0, g.Trise, 1}}.mul(g.Tm).mul(g.CTM) w0 := g.Tf.Width(int(s[n])) n++ - - f := g.Tf.BaseFont() - if i := strings.Index(f, "+"); i >= 0 { - f = f[i+1:] + if ch != ' ' { + f := g.Tf.BaseFont() + if i := strings.Index(f, "+"); i >= 0 { + f = f[i+1:] + } + text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)}) } - text = append(text, Text{f, Trm[0][0], Trm[2][0], Trm[2][1], w0 / 1000 * Trm[0][0], string(ch)}) - tx := w0/1000*g.Tfs + g.Tc + if ch == ' ' { + tx += g.Tw + } tx *= g.Th g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } @@ -690,7 +693,6 @@ func (p Page) Content() Content { g.Tm = matrix{{1, 0, 0}, {0, 1, 0}, {tx, 0, 1}}.mul(g.Tm) } } - showText("\n") case "TL": // set text leading if len(args) != 1 { diff --git a/pdfpasswd/main.go b/pdfpasswd/main.go index 57fa88f..188eaeb 100644 --- a/pdfpasswd/main.go +++ b/pdfpasswd/main.go @@ -12,7 +12,7 @@ import ( "log" "os" - "github.com/ledongthuc/pdf" + "github.com/rsc/pdf" ) var ( From 2f60b68cefa1f36be48d4dbbc868316436b51470 Mon Sep 17 00:00:00 2001 From: Rob Archibald Date: Thu, 17 Aug 2017 19:50:47 -0700 Subject: [PATCH 16/18] making pointer receiver to fix caching --- page.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/page.go b/page.go index 0280f6d..269e4ab 100644 --- a/page.go +++ b/page.go @@ -153,7 +153,7 @@ func (f Font) Width(code int) float64 { } // Encoder returns the encoding between font code point sequences and UTF-8. -func (f Font) Encoder() TextEncoding { +func (f *Font) Encoder() TextEncoding { if f.enc == nil { // caching the Encoder so we don't have to continually parse charmap f.enc = f.getEncoder() } From 681e79c6562862b93e743331cd6216fc429f1614 Mon Sep 17 00:00:00 2001 From: Rob Archibald Date: Thu, 17 Aug 2017 23:55:29 -0700 Subject: [PATCH 17/18] errors as values, not panic --- lex.go | 52 ++++++++++++------------- page.go | 115 ++++++++++++++++++++++++++++++-------------------------- ps.go | 33 ++++++++++------ read.go | 74 ++++++++++++++++++++---------------- 4 files changed, 147 insertions(+), 127 deletions(-) diff --git a/lex.go b/lex.go index ee73fd9..09c6485 100644 --- a/lex.go +++ b/lex.go @@ -7,9 +7,10 @@ package pdf import ( - "fmt" "io" "strconv" + + "github.com/pkg/errors" ) // A token is a PDF token in the input stream, one of the following Go types: @@ -78,11 +79,7 @@ func (b *buffer) readByte() byte { return c } -func (b *buffer) errorf(format string, args ...interface{}) { - panic(fmt.Errorf(format, args...)) -} - -func (b *buffer) reload() bool { +func (b *buffer) reload() (bool, error) { n := cap(b.buf) - int(b.offset%int64(cap(b.buf))) n, err := b.r.Read(b.buf[:n]) if n == 0 && err != nil { @@ -90,20 +87,19 @@ func (b *buffer) reload() bool { b.pos = 0 if b.allowEOF && err == io.EOF { b.eof = true - return false + return false, nil } - b.errorf("malformed PDF: reading at offset %d: %v", b.offset, err) - return false + return false, errors.Errorf("malformed PDF: reading at offset %d: %v", b.offset, err) } b.offset += int64(n) b.buf = b.buf[:n] b.pos = 0 - return true + return true, nil } func (b *buffer) seekForward(offset int64) { for b.offset < offset { - if !b.reload() { + if ok, _ := b.reload(); !ok { return } } @@ -174,8 +170,7 @@ func (b *buffer) readToken() token { default: if isDelim(c) { - b.errorf("unexpected delimiter %#q", rune(c)) - return nil + return errors.Errorf("unexpected delimiter %#q", rune(c)) } b.unreadByte() return b.readKeyword() @@ -200,8 +195,7 @@ func (b *buffer) readHexString() token { } x := unhex(c)<<4 | unhex(c2) if x < 0 { - b.errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]) - break + return errors.Errorf("malformed hex string %c %c %s", c, c2, b.buf[b.pos:]) } tmp = append(tmp, byte(x)) } @@ -241,8 +235,7 @@ Loop: case '\\': switch c = b.readByte(); c { default: - b.errorf("invalid escape sequence \\%c", c) - tmp = append(tmp, '\\', c) + return errors.Errorf("invalid escape sequence \\%c", c) case 'n': tmp = append(tmp, '\n') case 'r': @@ -273,7 +266,7 @@ Loop: x = x*8 + int(c-'0') } if x > 255 { - b.errorf("invalid octal escape \\%03o", x) + return errors.Errorf("invalid octal escape \\%03o", x) } tmp = append(tmp, byte(x)) } @@ -294,7 +287,7 @@ func (b *buffer) readName() token { if c == '#' { x := unhex(b.readByte())<<4 | unhex(b.readByte()) if x < 0 { - b.errorf("malformed name") + return errors.Errorf("malformed name") } tmp = append(tmp, byte(x)) continue @@ -325,13 +318,13 @@ func (b *buffer) readKeyword() token { case isInteger(s): x, err := strconv.ParseInt(s, 10, 64) if err != nil { - b.errorf("invalid integer %s", s) + return errors.Errorf("invalid integer %s", s) } return x case isReal(s): x, err := strconv.ParseFloat(s, 64) if err != nil { - b.errorf("invalid real %s", s) + return errors.Errorf("invalid real %s", s) } return x } @@ -420,12 +413,15 @@ func (b *buffer) readObject() object { case "[": return b.readArray() } - b.errorf("unexpected keyword %q parsing object", kw) - return nil + return errors.Errorf("unexpected keyword %q parsing object", kw) } if str, ok := tok.(string); ok && b.key != nil && b.objptr.id != 0 { - tok = decryptString(b.key, b.useAES, b.objptr, str) + var err error + tok, err = decryptString(b.key, b.useAES, b.objptr, str) + if err != nil { + return err + } } if !b.allowObjptr { @@ -446,8 +442,7 @@ func (b *buffer) readObject() object { if _, ok := obj.(stream); !ok { tok4 := b.readToken() if tok4 != keyword("endobj") { - b.errorf("missing endobj after indirect object definition") - b.unreadToken(tok4) + return errors.Errorf("missing endobj after indirect object definition") } } b.objptr = old @@ -482,8 +477,7 @@ func (b *buffer) readDict() object { } n, ok := tok.(name) if !ok { - b.errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok) - continue + return errors.Errorf("unexpected non-name key %T(%v) parsing dictionary", tok, tok) } x[n] = b.readObject() } @@ -506,7 +500,7 @@ func (b *buffer) readDict() object { case '\n': // ok default: - b.errorf("stream keyword not followed by newline") + return errors.Errorf("stream keyword not followed by newline") } return stream{x, b.objptr, b.readOffset()} diff --git a/page.go b/page.go index 269e4ab..e6cc3ad 100644 --- a/page.go +++ b/page.go @@ -9,6 +9,8 @@ import ( "fmt" "io" "strings" + + "github.com/pkg/errors" ) // A Page represent a single page in a PDF file. @@ -58,7 +60,7 @@ func (r *Reader) NumPage() int { } // GetPlainText returns all the text in the PDF file -func (r *Reader) GetPlainText() io.Reader { +func (r *Reader) GetPlainText() (io.Reader, error) { pages := r.NumPage() var buf bytes.Buffer fonts := make(map[string]*Font) @@ -70,9 +72,16 @@ func (r *Reader) GetPlainText() io.Reader { fonts[name] = &f } } - buf.WriteString(p.GetPlainText(fonts)) + r, err := p.GetPlainText(fonts) + if err != nil { + return nil, err + } + _, err = buf.ReadFrom(r) + if err != nil { + return nil, err + } } - return &buf + return &buf, nil } func (p Page) findInherited(key string) Value { @@ -188,8 +197,8 @@ func (f Font) getEncoder() TextEncoding { func (f *Font) charmapEncoding() TextEncoding { toUnicode := f.V.Key("ToUnicode") if toUnicode.Kind() == Stream { - m := readCmap(toUnicode) - if m == nil { + m, err := readCmap(toUnicode) + if err != nil { return &nopEncoder{} } return m @@ -326,14 +335,10 @@ Parse: return string(r) } -func readCmap(toUnicode Value) *cmap { +func readCmap(toUnicode Value) (*cmap, error) { n := -1 var m cmap - ok := true - Interpret(toUnicode, func(stk *Stack, op string) { - if !ok { - return - } + err := Interpret(toUnicode, func(stk *Stack, op string) error { switch op { case "findresource": stk.Pop() // category @@ -347,16 +352,12 @@ func readCmap(toUnicode Value) *cmap { n = int(stk.Pop().Int64()) case "endcodespacerange": if n < 0 { - println("missing begincodespacerange") - ok = false - return + return errors.New("missing begincodespacerange") } for i := 0; i < n; i++ { hi, lo := stk.Pop().RawString(), stk.Pop().RawString() if len(lo) == 0 || len(lo) != len(hi) { - println("bad codespace range") - ok = false - return + return errors.New("bad codespace range") } m.space[len(lo)-1] = append(m.space[len(lo)-1], byteRange{lo, hi}) } @@ -365,7 +366,7 @@ func readCmap(toUnicode Value) *cmap { n = int(stk.Pop().Int64()) case "endbfchar": if n < 0 { - panic("missing beginbfchar") + return errors.New("missing beginbfchar") } for i := 0; i < n; i++ { repl, orig := stk.Pop().RawString(), stk.Pop().RawString() @@ -375,7 +376,7 @@ func readCmap(toUnicode Value) *cmap { n = int(stk.Pop().Int64()) case "endbfrange": if n < 0 { - panic("missing beginbfrange") + return errors.New("missing beginbfrange") } for i := 0; i < n; i++ { dst, srcHi, srcLo := stk.Pop(), stk.Pop().RawString(), stk.Pop().RawString() @@ -389,11 +390,12 @@ func readCmap(toUnicode Value) *cmap { default: println("interp\t", op) } - }) - if !ok { return nil + }) + if err != nil { + return nil, err } - return &m + return &m, err } type matrix [3][3]float64 @@ -456,7 +458,7 @@ type gstate struct { // GetPlainText returns the page's all text without format. // fonts can be passed in (to improve parsing performance) or left nil -func (p Page) GetPlainText(fonts map[string]*Font) string { +func (p Page) GetPlainText(fonts map[string]*Font) (io.Reader, error) { strm := p.V.Key("Contents") var enc TextEncoding = &nopEncoder{} @@ -471,14 +473,11 @@ func (p Page) GetPlainText(fonts map[string]*Font) string { var textBuilder bytes.Buffer showText := func(s string) { for _, ch := range enc.Decode(s) { - _, err := textBuilder.WriteRune(ch) - if err != nil { - panic(err) - } + textBuilder.WriteRune(ch) } } - Interpret(strm, func(stk *Stack, op string) { + err := Interpret(strm, func(stk *Stack, op string) error { n := stk.Len() args := make([]Value, n) for i := n - 1; i >= 0; i-- { @@ -487,12 +486,12 @@ func (p Page) GetPlainText(fonts map[string]*Font) string { switch op { default: - return + return nil case "T*": // move to start of next line showText("\n") case "Tf": // set text font and size if len(args) != 2 { - panic("bad TL") + return errors.New("bad TL") } if font, ok := fonts[args[0].Name()]; ok { enc = font.Encoder() @@ -501,17 +500,17 @@ func (p Page) GetPlainText(fonts map[string]*Font) string { } case "\"": // set spacing, move to next line, and show text if len(args) != 3 { - panic("bad \" operator") + return errors.New("bad \" operator") } fallthrough case "'": // move to next line and show text if len(args) != 1 { - panic("bad ' operator") + return errors.New("bad ' operator") } fallthrough case "Tj": // show text if len(args) != 1 { - panic("bad Tj operator") + return errors.New("bad Tj operator") } showText(args[0].RawString()) case "TJ": // show text, allowing individual glyph positioning @@ -523,12 +522,16 @@ func (p Page) GetPlainText(fonts map[string]*Font) string { } } } + return nil }) - return textBuilder.String() + if err != nil { + return nil, err + } + return &textBuilder, nil } // Content returns the page's content. -func (p Page) Content() Content { +func (p Page) Content() (Content, error) { strm := p.V.Key("Contents") var enc TextEncoding = &nopEncoder{} @@ -562,7 +565,7 @@ func (p Page) Content() Content { var rect []Rect var gstack []gstate - Interpret(strm, func(stk *Stack, op string) { + err := Interpret(strm, func(stk *Stack, op string) error { n := stk.Len() args := make([]Value, n) for i := n - 1; i >= 0; i-- { @@ -571,11 +574,11 @@ func (p Page) Content() Content { switch op { default: //fmt.Println(op, args) - return + return nil case "cm": // update g.CTM if len(args) != 6 { - panic("bad g.Tm") + return errors.New("bad g.Tm") } var m matrix for i := 0; i < 6; i++ { @@ -601,7 +604,7 @@ func (p Page) Content() Content { case "re": // append rectangle to path if len(args) != 4 { - panic("bad re") + return errors.New("bad re") } x, y, w, h := args[0].Float64(), args[1].Float64(), args[2].Float64(), args[3].Float64() rect = append(rect, Rect{Point{x, y}, Point{x + w, y + h}}) @@ -627,19 +630,19 @@ func (p Page) Content() Content { case "Tc": // set character spacing if len(args) != 1 { - panic("bad g.Tc") + return errors.New("bad g.Tc") } g.Tc = args[0].Float64() case "TD": // move text position and set leading if len(args) != 2 { - panic("bad Td") + return errors.New("bad Td") } g.Tl = -args[1].Float64() fallthrough case "Td": // move text position if len(args) != 2 { - panic("bad Td") + return errors.New("bad Td") } tx := args[0].Float64() ty := args[1].Float64() @@ -649,7 +652,7 @@ func (p Page) Content() Content { case "Tf": // set text font and size if len(args) != 2 { - panic("bad TL") + return errors.New("bad TL") } f := args[0].Name() g.Tf = p.Font(f) @@ -662,7 +665,7 @@ func (p Page) Content() Content { case "\"": // set spacing, move to next line, and show text if len(args) != 3 { - panic("bad \" operator") + return errors.New("bad \" operator") } g.Tw = args[0].Float64() g.Tc = args[1].Float64() @@ -670,7 +673,7 @@ func (p Page) Content() Content { fallthrough case "'": // move to next line and show text if len(args) != 1 { - panic("bad ' operator") + return errors.New("bad ' operator") } x := matrix{{1, 0, 0}, {0, 1, 0}, {0, -g.Tl, 1}} g.Tlm = x.mul(g.Tlm) @@ -678,7 +681,7 @@ func (p Page) Content() Content { fallthrough case "Tj": // show text if len(args) != 1 { - panic("bad Tj operator") + return errors.New("bad Tj operator") } showText(args[0].RawString()) @@ -696,13 +699,13 @@ func (p Page) Content() Content { case "TL": // set text leading if len(args) != 1 { - panic("bad TL") + return errors.New("bad TL") } g.Tl = args[0].Float64() case "Tm": // set text matrix and line matrix if len(args) != 6 { - panic("bad g.Tm") + return errors.New("bad g.Tm") } var m matrix for i := 0; i < 6; i++ { @@ -714,30 +717,34 @@ func (p Page) Content() Content { case "Tr": // set text rendering mode if len(args) != 1 { - panic("bad Tr") + return errors.New("bad Tr") } g.Tmode = int(args[0].Int64()) case "Ts": // set text rise if len(args) != 1 { - panic("bad Ts") + return errors.New("bad Ts") } g.Trise = args[0].Float64() case "Tw": // set word spacing if len(args) != 1 { - panic("bad g.Tw") + return errors.New("bad g.Tw") } g.Tw = args[0].Float64() case "Tz": // set horizontal text scaling if len(args) != 1 { - panic("bad Tz") + return errors.New("bad Tz") } g.Th = args[0].Float64() / 100 } + return nil }) - return Content{text, rect} + if err != nil { + return Content{}, err + } + return Content{text, rect}, nil } // TextVertical implements sort.Interface for sorting @@ -754,7 +761,7 @@ func (x TextVertical) Less(i, j int) bool { return x[i].X < x[j].X } -// TextVertical implements sort.Interface for sorting +// TextHorizontal implements sort.Interface for sorting // a slice of Text values in horizontal order, left to right, // and then top to bottom within a column. type TextHorizontal []Text diff --git a/ps.go b/ps.go index 90c551e..4d5bac2 100644 --- a/ps.go +++ b/ps.go @@ -7,6 +7,8 @@ package pdf import ( "fmt" "io" + + "github.com/pkg/errors" ) // A Stack represents a stack of values. @@ -34,7 +36,7 @@ func (stk *Stack) Pop() Value { } func newDict() Value { - return Value{nil, objptr{}, make(dict)} + return Value{r: nil, ptr: objptr{}, data: make(dict)} } // Interpret interprets the content in a stream as a basic PostScript program, @@ -51,7 +53,7 @@ func newDict() Value { // // There is no support for executable blocks, among other limitations. // -func Interpret(strm Value, do func(stk *Stack, op string)) { +func Interpret(strm Value, do func(stk *Stack, op string) error) error { rd := strm.Reader() b := newBuffer(rd, 0) b.allowEOF = true @@ -65,6 +67,9 @@ Reading: if tok == io.EOF { break } + if err, ok := tok.(error); ok { + return err + } if kw, ok := tok.(keyword); ok { switch kw { case "null", "[", "]", "<<", ">>": @@ -72,43 +77,46 @@ Reading: default: for i := len(dicts) - 1; i >= 0; i-- { if v, ok := dicts[i][name(kw)]; ok { - stk.Push(Value{nil, objptr{}, v}) + stk.Push(Value{r: nil, ptr: objptr{}, data: v}) continue Reading } } - do(&stk, string(kw)) + err := do(&stk, string(kw)) + if err != nil { + return err + } continue case "dict": stk.Pop() - stk.Push(Value{nil, objptr{}, make(dict)}) + stk.Push(Value{r: nil, ptr: objptr{}, data: make(dict)}) continue case "currentdict": if len(dicts) == 0 { - panic("no current dictionary") + return errors.New("no current dictionary") } - stk.Push(Value{nil, objptr{}, dicts[len(dicts)-1]}) + stk.Push(Value{r: nil, ptr: objptr{}, data: dicts[len(dicts)-1]}) continue case "begin": d := stk.Pop() if d.Kind() != Dict { - panic("cannot begin non-dict") + return errors.New("cannot begin non-dict") } dicts = append(dicts, d.data.(dict)) continue case "end": if len(dicts) <= 0 { - panic("mismatched begin/end") + return errors.New("mismatched begin/end") } dicts = dicts[:len(dicts)-1] continue case "def": if len(dicts) <= 0 { - panic("def without open dict") + return errors.New("def without open dict") } val := stk.Pop() key, ok := stk.Pop().data.(name) if !ok { - panic("def of non-name") + return errors.New("def of non-name") } dicts[len(dicts)-1][key] = val.data continue @@ -119,8 +127,9 @@ Reading: } b.unreadToken(tok) obj := b.readObject() - stk.Push(Value{nil, objptr{}, obj}) + stk.Push(Value{r: nil, ptr: objptr{}, data: obj}) } + return nil } type seqReader struct { diff --git a/read.go b/read.go index f3ad3ed..892c823 100644 --- a/read.go +++ b/read.go @@ -73,6 +73,8 @@ import ( "os" "sort" "strconv" + + "github.com/pkg/errors" ) // A Reader is a single PDF file open for reading. @@ -93,10 +95,6 @@ type xref struct { offset int64 } -func (r *Reader) errorf(format string, args ...interface{}) { - panic(fmt.Errorf(format, args...)) -} - // Open opens a file for reading. func Open(file string) (*Reader, error) { // TODO: Deal with closing file. @@ -188,7 +186,7 @@ func NewReaderEncrypted(f io.ReaderAt, size int64, pw func() string) (*Reader, e // Trailer returns the file's Trailer value. func (r *Reader) Trailer() Value { - return Value{r, r.trailerptr, r.trailer} + return Value{r: r, ptr: r.trailerptr, data: r.trailer} } func readXref(r *Reader, b *buffer) ([]xref, objptr, dict, error) { @@ -244,7 +242,7 @@ func readXrefStream(r *Reader, b *buffer) ([]xref, objptr, dict, error) { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream not found: %v", objfmt(obj)) } prevoff = prevstrm.hdr["Prev"] - prev := Value{r, objptr{}, prevstrm} + prev := Value{r: r, ptr: objptr{}, data: prevstrm} if prev.Kind() != Stream { return nil, objptr{}, nil, fmt.Errorf("malformed PDF: xref prev stream is not stream: %v", prev) } @@ -288,7 +286,7 @@ func readXrefStreamData(r *Reader, strm stream, table []xref, size int64) ([]xre return nil, fmt.Errorf("invalid W array %v", objfmt(ww)) } - v := Value{r, objptr{}, strm} + v := Value{r: r, ptr: objptr{}, data: strm} wtotal := 0 for _, wid := range w { wtotal += wid @@ -444,6 +442,7 @@ type Value struct { r *Reader ptr objptr data interface{} + err error } // IsNull reports whether the value is a null. It is equivalent to Kind() == Null. @@ -451,6 +450,11 @@ func (v Value) IsNull() bool { return v.data == nil } +// IsError reports whether the value is an error. It is equivalent to v.err != nil +func (v Value) IsError() bool { + return v.err != nil +} + // A ValueKind specifies the kind of data underlying a Value. type ValueKind int @@ -653,6 +657,9 @@ func (v Value) Name() string { // If v is a stream, Key applies to the stream's header dictionary. // If v.Kind() != Dict and v.Kind() != Stream, Key returns a null Value. func (v Value) Key(key string) Value { + if v.IsError() { + return v + } x, ok := v.data.(dict) if !ok { strm, ok := v.data.(stream) @@ -688,6 +695,9 @@ func (v Value) Keys() []string { // If v.Kind() != Array or if i is outside the array bounds, // Index returns a null Value. func (v Value) Index(i int) Value { + if v.IsError() { + return v + } x, ok := v.data.(array) if !ok || i < 0 || i >= len(x) { return Value{} @@ -720,15 +730,15 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { Search: for { if strm.Kind() != Stream { - panic("not a stream") + return Value{err: errors.New("not a stream")} } if strm.Key("Type").Name() != "ObjStm" { - panic("not an object stream") + return Value{err: errors.New("not an object stream")} } n := int(strm.Key("N").Int64()) first := strm.Key("First").Int64() if first == 0 { - panic("missing First") + return Value{err: errors.New("missing First")} } b := newBuffer(strm.Reader(), 0) b.allowEOF = true @@ -743,7 +753,7 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { } ext := strm.Key("Extends") if ext.Kind() != Stream { - panic("cannot find object in stream") + return Value{err: errors.New("cannot find object in stream")} } strm = ext } @@ -754,11 +764,10 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { obj = b.readObject() def, ok := obj.(objdef) if !ok { - panic(fmt.Errorf("loading %v: found %T instead of objdef", ptr, obj)) - return Value{} + return Value{err: fmt.Errorf("loading %v: found %T instead of objdef", ptr, obj)} } if def.ptr != ptr { - panic(fmt.Errorf("loading %v: found %v", ptr, def.ptr)) + return Value{err: fmt.Errorf("loading %v: found %v", ptr, def.ptr)} } x = def.obj } @@ -767,11 +776,11 @@ func (r *Reader) resolve(parent objptr, x interface{}) Value { switch x := x.(type) { case nil, bool, int64, float64, name, dict, array, stream: - return Value{r, parent, x} + return Value{r: r, ptr: parent, data: x} case string: - return Value{r, parent, x} + return Value{r: r, ptr: parent, data: x} default: - panic(fmt.Errorf("unexpected value type %T in resolve", x)) + return Value{err: fmt.Errorf("unexpected value type %T in resolve", x)} } } @@ -791,6 +800,9 @@ func (e *errorReadCloser) Close() error { // If v.Kind() != Stream, Reader returns a ReadCloser that // responds to all reads with a ``stream not present'' error. func (v Value) Reader() io.ReadCloser { + if v.IsError() { + return &errorReadCloser{errors.Wrap(v.err, "stream not present")} + } x, ok := v.data.(stream) if !ok { return &errorReadCloser{fmt.Errorf("stream not present")} @@ -804,7 +816,7 @@ func (v Value) Reader() io.ReadCloser { param := v.Key("DecodeParms") switch filter.Kind() { default: - panic(fmt.Errorf("unsupported filter %v", filter)) + return &errorReadCloser{fmt.Errorf("unsupported filter %v", filter)} case Null: // ok case Name: @@ -821,11 +833,11 @@ func (v Value) Reader() io.ReadCloser { func applyFilter(rd io.Reader, name string, param Value) io.Reader { switch name { default: - panic("unknown filter " + name) + return &errorReadCloser{errors.New("unknown filter " + name)} case "FlateDecode": zr, err := zlib.NewReader(rd) if err != nil { - panic(err) + return &errorReadCloser{err} } pred := param.Key("Predictor") if pred.Kind() == Null { @@ -834,8 +846,7 @@ func applyFilter(rd io.Reader, name string, param Value) io.Reader { columns := param.Key("Columns").Int64() switch pred.Int64() { default: - fmt.Println("unknown predictor", pred) - panic("pred") + return &errorReadCloser{errors.Errorf("unknown predictor %v", pred)} case 12: return &pngUpReader{r: zr, hist: make([]byte, 1+columns), tmp: make([]byte, 1+columns)} } @@ -1026,17 +1037,16 @@ func cryptKey(key []byte, useAES bool, ptr objptr) []byte { return h.Sum(nil) } -func decryptString(key []byte, useAES bool, ptr objptr, x string) string { +func decryptString(key []byte, useAES bool, ptr objptr, x string) (string, error) { key = cryptKey(key, useAES, ptr) if useAES { - panic("AES not implemented") - } else { - c, _ := rc4.NewCipher(key) - data := []byte(x) - c.XORKeyStream(data, data) - x = string(data) + return "", errors.New("AES not implemented") } - return x + + c, _ := rc4.NewCipher(key) + data := []byte(x) + c.XORKeyStream(data, data) + return string(data), nil } func decryptStream(key []byte, useAES bool, ptr objptr, rd io.Reader) io.Reader { @@ -1044,7 +1054,7 @@ func decryptStream(key []byte, useAES bool, ptr objptr, rd io.Reader) io.Reader if useAES { cb, err := aes.NewCipher(key) if err != nil { - panic("AES: " + err.Error()) + return &errorReadCloser{errors.New("AES: " + err.Error())} } iv := make([]byte, 16) io.ReadFull(rd, iv) @@ -1052,7 +1062,7 @@ func decryptStream(key []byte, useAES bool, ptr objptr, rd io.Reader) io.Reader rd = &cbcReader{cbc: cbc, rd: rd, buf: make([]byte, 16)} } else { c, _ := rc4.NewCipher(key) - rd = &cipher.StreamReader{c, rd} + rd = &cipher.StreamReader{S: c, R: rd} } return rd } From 8a4da024a28e744e948e98a8a544a384e74e4ed8 Mon Sep 17 00:00:00 2001 From: Rob Archibald Date: Fri, 18 Aug 2017 09:13:21 -0700 Subject: [PATCH 18/18] More errors as values --- lex.go | 78 +++++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 58 insertions(+), 20 deletions(-) diff --git a/lex.go b/lex.go index 09c6485..ff331ae 100644 --- a/lex.go +++ b/lex.go @@ -67,16 +67,18 @@ func (b *buffer) seek(offset int64) { b.unread = b.unread[:0] } -func (b *buffer) readByte() byte { +func (b *buffer) readByte() (byte, error) { if b.pos >= len(b.buf) { - b.reload() + if _, err := b.reload(); err != nil { + return '\x00', err + } if b.pos >= len(b.buf) { - return '\n' + return '\n', nil } } c := b.buf[b.pos] b.pos++ - return c + return c, nil } func (b *buffer) reload() (bool, error) { @@ -128,16 +130,19 @@ func (b *buffer) readToken() token { } // Find first non-space, non-comment byte. - c := b.readByte() + c, err := b.readByte() for { + if err != nil { + return err + } if isSpace(c) { if b.eof { return io.EOF } - c = b.readByte() + c, err = b.readByte() } else if c == '%' { for c != '\r' && c != '\n' { - c = b.readByte() + c, err = b.readByte() } } else { break @@ -146,7 +151,7 @@ func (b *buffer) readToken() token { switch c { case '<': - if b.readByte() == '<' { + if b, _ := b.readByte(); b == '<' { return keyword("<<") } b.unreadByte() @@ -162,7 +167,7 @@ func (b *buffer) readToken() token { return b.readName() case '>': - if b.readByte() == '>' { + if b, _ := b.readByte(); b == '>' { return keyword(">>") } b.unreadByte() @@ -181,7 +186,10 @@ func (b *buffer) readHexString() token { tmp := b.tmp[:0] for { Loop: - c := b.readByte() + c, err := b.readByte() + if err != nil { + return err + } if c == '>' { break } @@ -189,7 +197,10 @@ func (b *buffer) readHexString() token { goto Loop } Loop2: - c2 := b.readByte() + c2, err := b.readByte() + if err != nil { + return err + } if isSpace(c2) { goto Loop2 } @@ -220,7 +231,10 @@ func (b *buffer) readLiteralString() token { depth := 1 Loop: for { - c := b.readByte() + c, err := b.readByte() + if err != nil { + return err + } switch c { default: tmp = append(tmp, c) @@ -233,7 +247,10 @@ Loop: } tmp = append(tmp, c) case '\\': - switch c = b.readByte(); c { + if c, err = b.readByte(); err != nil { + return err + } + switch c { default: return errors.Errorf("invalid escape sequence \\%c", c) case 'n': @@ -249,7 +266,7 @@ Loop: case '(', ')', '\\': tmp = append(tmp, c) case '\r': - if b.readByte() != '\n' { + if c, _ := b.readByte(); c != '\n' { b.unreadByte() } fallthrough @@ -258,7 +275,10 @@ Loop: case '0', '1', '2', '3', '4', '5', '6', '7': x := int(c - '0') for i := 0; i < 2; i++ { - c = b.readByte() + c, err = b.readByte() + if err != nil { + return err + } if c < '0' || c > '7' { b.unreadByte() break @@ -279,13 +299,24 @@ Loop: func (b *buffer) readName() token { tmp := b.tmp[:0] for { - c := b.readByte() + c, err := b.readByte() + if err != nil { + return err + } if isDelim(c) || isSpace(c) { b.unreadByte() break } if c == '#' { - x := unhex(b.readByte())<<4 | unhex(b.readByte()) + hi, err := b.readByte() + if err != nil { + return err + } + lo, err := b.readByte() + if err != nil { + return err + } + x := unhex(hi)<<4 | unhex(lo) if x < 0 { return errors.Errorf("malformed name") } @@ -301,7 +332,10 @@ func (b *buffer) readName() token { func (b *buffer) readKeyword() token { tmp := b.tmp[:0] for { - c := b.readByte() + c, err := b.readByte() + if err != nil { + return err + } if isDelim(c) || isSpace(c) { b.unreadByte() break @@ -492,9 +526,13 @@ func (b *buffer) readDict() object { return x } - switch b.readByte() { + c, err := b.readByte() + if err != nil { + return err + } + switch c { case '\r': - if b.readByte() != '\n' { + if x, _ := b.readByte(); x != '\n' { b.unreadByte() } case '\n':