parser/pageparser: Split the page lexer into some more files

author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Mon, 17 Dec 2018 19:54:06 +0000 (20:54 +0100)

committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>

Thu, 20 Dec 2018 19:08:01 +0000 (20:08 +0100)
author Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Mon, 17 Dec 2018 19:54:06 +0000 (20:54 +0100)
committer Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
Thu, 20 Dec 2018 19:08:01 +0000 (20:08 +0100)
diff --git a/parser/pageparser/pagelexer.go b/parser/pageparser/pagelexer.go

index 5802c318bb95e841ddbc17aff20f94fd5a890ed7..d11e88403c8367e21fda2b5b0450c42d3a3f1d8c 100644 (file)
--- a/parser/pageparser/pagelexer.go
+++ b/parser/pageparser/pagelexer.go
@@ -29,18 +29,6 @@ const eof = -1
  // returns the next state in scanner.
  type stateFunc func(*pageLexer) stateFunc
  
-type lexerShortcodeState struct {
-       currLeftDelimItem  ItemType
-       currRightDelimItem ItemType
-       isInline           bool
-       currShortcodeName  string          // is only set when a shortcode is in opened state
-       closingState       int             // > 0 = on its way to be closed
-       elementStepNum     int             // step number in element
-       paramElements      int             // number of elements (name + value = 2) found first
-       openShortcodes     map[string]bool // set of shortcodes in open state
-
-}
-
  type pageLexer struct {
         input      []byte
         stateStart stateFunc
@@ -102,17 +90,6 @@ func (l *pageLexer) run() *pageLexer {
         return l
  }
  
-// Shortcode syntax
-var (
-       leftDelimSc            = []byte("{{")
-       leftDelimScNoMarkup    = []byte("{{<")
-       rightDelimScNoMarkup   = []byte(">}}")
-       leftDelimScWithMarkup  = []byte("{{%")
-       rightDelimScWithMarkup = []byte("%}}")
-       leftComment            = []byte("/*") // comments in this context us used to to mark shortcodes as "not really a shortcode"
-       rightComment           = []byte("*/")
-)
-
  // Page syntax
  var (
         byteOrderMark     = '\ufeff'
@@ -293,11 +270,6 @@ func lexMainSection(l *pageLexer) stateFunc {
  
  }
  
-func (l *pageLexer) isShortCodeStart() bool {
-       return l.hasPrefix(leftDelimScWithMarkup) || l.hasPrefix(leftDelimScNoMarkup)
-
-}
-
  func (l *pageLexer) posFirstNonWhiteSpace() int {
         f := func(c rune) bool {
                 return !unicode.IsSpace(c)
@@ -305,69 +277,6 @@ func (l *pageLexer) posFirstNonWhiteSpace() int {
         return bytes.IndexFunc(l.input[l.pos:], f)
  }
  
-func lexIntroSection(l *pageLexer) stateFunc {
-       l.summaryDivider = summaryDivider
-
-LOOP:
-       for {
-               r := l.next()
-               if r == eof {
-                       break
-               }
-
-               switch {
-               case r == '+':
-                       return l.lexFrontMatterSection(TypeFrontMatterTOML, r, "TOML", delimTOML)
-               case r == '-':
-                       return l.lexFrontMatterSection(TypeFrontMatterYAML, r, "YAML", delimYAML)
-               case r == '{':
-                       return lexFrontMatterJSON
-               case r == '#':
-                       return lexFrontMatterOrgMode
-               case r == byteOrderMark:
-                       l.emit(TypeIgnore)
-               case !isSpace(r) && !isEndOfLine(r):
-                       if r == '<' {
-                               l.backup()
-                               if l.hasPrefix(htmlCommentStart) {
-                                       // This may be commented out front mattter, which should
-                                       // still be read.
-                                       l.consumeToNextLine()
-                                       l.isInHTMLComment = true
-                                       l.emit(TypeIgnore)
-                                       continue LOOP
-                               } else {
-                                       if l.pos > l.start {
-                                               l.emit(tText)
-                                       }
-                                       l.next()
-                                       // This is the start of a plain HTML document with no
-                                       // front matter. I still can contain shortcodes, so we
-                                       // have to keep looking.
-                                       l.emit(TypeHTMLStart)
-                               }
-                       }
-                       break LOOP
-               }
-       }
-
-       // Now move on to the shortcodes.
-       return lexMainSection
-}
-
-func lexEndFromtMatterHTMLComment(l *pageLexer) stateFunc {
-       l.isInHTMLComment = false
-       right := l.index(htmlCommentEnd)
-       if right == -1 {
-               return l.errorf("starting HTML comment with no end")
-       }
-       l.pos += right + len(htmlCommentEnd)
-       l.emit(TypeIgnore)
-
-       // Now move on to the shortcodes.
-       return lexMainSection
-}
-
  func lexDone(l *pageLexer) stateFunc {
  
         // Done!
@@ -378,385 +287,10 @@ func lexDone(l *pageLexer) stateFunc {
         return nil
  }
  
-func lexFrontMatterJSON(l *pageLexer) stateFunc {
-       // Include the left delimiter
-       l.backup()
-
-       var (
-               inQuote bool
-               level   int
-       )
-
-       for {
-
-               r := l.next()
-
-               switch {
-               case r == eof:
-                       return l.errorf("unexpected EOF parsing JSON front matter")
-               case r == '{':
-                       if !inQuote {
-                               level++
-                       }
-               case r == '}':
-                       if !inQuote {
-                               level--
-                       }
-               case r == '"':
-                       inQuote = !inQuote
-               case r == '\\':
-                       // This may be an escaped quote. Make sure it's not marked as a
-                       // real one.
-                       l.next()
-               }
-
-               if level == 0 {
-                       break
-               }
-       }
-
-       l.consumeCRLF()
-       l.emit(TypeFrontMatterJSON)
-
-       return lexMainSection
-}
-
-func lexFrontMatterOrgMode(l *pageLexer) stateFunc {
-       /*
-               #+TITLE: Test File For chaseadamsio/goorgeous
-               #+AUTHOR: Chase Adams
-               #+DESCRIPTION: Just another golang parser for org content!
-       */
-
-       l.summaryDivider = summaryDividerOrg
-
-       l.backup()
-
-       if !l.hasPrefix(delimOrg) {
-               return lexMainSection
-       }
-
-       // Read lines until we no longer see a #+ prefix
-LOOP:
-       for {
-
-               r := l.next()
-
-               switch {
-               case r == '\n':
-                       if !l.hasPrefix(delimOrg) {
-                               break LOOP
-                       }
-               case r == eof:
-                       break LOOP
-
-               }
-       }
-
-       l.emit(TypeFrontMatterORG)
-
-       return lexMainSection
-
-}
-
  func (l *pageLexer) printCurrentInput() {
         fmt.Printf("input[%d:]: %q", l.pos, string(l.input[l.pos:]))
  }
  
-// Handle YAML or TOML front matter.
-func (l *pageLexer) lexFrontMatterSection(tp ItemType, delimr rune, name string, delim []byte) stateFunc {
-
-       for i := 0; i < 2; i++ {
-               if r := l.next(); r != delimr {
-                       return l.errorf("invalid %s delimiter", name)
-               }
-       }
-
-       // Let front matter start at line 1
-       wasEndOfLine := l.consumeCRLF()
-       // We don't care about the delimiters.
-       l.ignore()
-
-       var r rune
-
-       for {
-               if !wasEndOfLine {
-                       r = l.next()
-                       if r == eof {
-                               return l.errorf("EOF looking for end %s front matter delimiter", name)
-                       }
-               }
-
-               if wasEndOfLine || isEndOfLine(r) {
-                       if l.hasPrefix(delim) {
-                               l.emit(tp)
-                               l.pos += 3
-                               l.consumeCRLF()
-                               l.ignore()
-                               break
-                       }
-               }
-
-               wasEndOfLine = false
-       }
-
-       return lexMainSection
-}
-
-func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
-       l.pos += len(l.currentLeftShortcodeDelim())
-       if l.hasPrefix(leftComment) {
-               return lexShortcodeComment
-       }
-       l.emit(l.currentLeftShortcodeDelimItem())
-       l.elementStepNum = 0
-       l.paramElements = 0
-       return lexInsideShortcode
-}
-
-func lexShortcodeComment(l *pageLexer) stateFunc {
-       posRightComment := l.index(append(rightComment, l.currentRightShortcodeDelim()...))
-       if posRightComment <= 1 {
-               return l.errorf("comment must be closed")
-       }
-       // we emit all as text, except the comment markers
-       l.emit(tText)
-       l.pos += len(leftComment)
-       l.ignore()
-       l.pos += posRightComment - len(leftComment)
-       l.emit(tText)
-       l.pos += len(rightComment)
-       l.ignore()
-       l.pos += len(l.currentRightShortcodeDelim())
-       l.emit(tText)
-       return lexMainSection
-}
-
-func lexShortcodeRightDelim(l *pageLexer) stateFunc {
-       l.closingState = 0
-       l.pos += len(l.currentRightShortcodeDelim())
-       l.emit(l.currentRightShortcodeDelimItem())
-       return lexMainSection
-}
-
-// either:
-// 1. param
-// 2. "param" or "param\"
-// 3. param="123" or param="123\"
-// 4. param="Some \"escaped\" text"
-func lexShortcodeParam(l *pageLexer, escapedQuoteStart bool) stateFunc {
-
-       first := true
-       nextEq := false
-
-       var r rune
-
-       for {
-               r = l.next()
-               if first {
-                       if r == '"' {
-                               // a positional param with quotes
-                               if l.paramElements == 2 {
-                                       return l.errorf("got quoted positional parameter. Cannot mix named and positional parameters")
-                               }
-                               l.paramElements = 1
-                               l.backup()
-                               return lexShortcodeQuotedParamVal(l, !escapedQuoteStart, tScParam)
-                       }
-                       first = false
-               } else if r == '=' {
-                       // a named param
-                       l.backup()
-                       nextEq = true
-                       break
-               }
-
-               if !isAlphaNumericOrHyphen(r) {
-                       l.backup()
-                       break
-               }
-       }
-
-       if l.paramElements == 0 {
-               l.paramElements++
-
-               if nextEq {
-                       l.paramElements++
-               }
-       } else {
-               if nextEq && l.paramElements == 1 {
-                       return l.errorf("got named parameter '%s'. Cannot mix named and positional parameters", l.current())
-               } else if !nextEq && l.paramElements == 2 {
-                       return l.errorf("got positional parameter '%s'. Cannot mix named and positional parameters", l.current())
-               }
-       }
-
-       l.emit(tScParam)
-       return lexInsideShortcode
-
-}
-
-func lexShortcodeQuotedParamVal(l *pageLexer, escapedQuotedValuesAllowed bool, typ ItemType) stateFunc {
-       openQuoteFound := false
-       escapedInnerQuoteFound := false
-       escapedQuoteState := 0
-
-Loop:
-       for {
-               switch r := l.next(); {
-               case r == '\\':
-                       if l.peek() == '"' {
-                               if openQuoteFound && !escapedQuotedValuesAllowed {
-                                       l.backup()
-                                       break Loop
-                               } else if openQuoteFound {
-                                       // the coming quoute is inside
-                                       escapedInnerQuoteFound = true
-                                       escapedQuoteState = 1
-                               }
-                       }
-               case r == eof, r == '\n':
-                       return l.errorf("unterminated quoted string in shortcode parameter-argument: '%s'", l.current())
-               case r == '"':
-                       if escapedQuoteState == 0 {
-                               if openQuoteFound {
-                                       l.backup()
-                                       break Loop
-
-                               } else {
-                                       openQuoteFound = true
-                                       l.ignore()
-                               }
-                       } else {
-                               escapedQuoteState = 0
-                       }
-
-               }
-       }
-
-       if escapedInnerQuoteFound {
-               l.ignoreEscapesAndEmit(typ)
-       } else {
-               l.emit(typ)
-       }
-
-       r := l.next()
-
-       if r == '\\' {
-               if l.peek() == '"' {
-                       // ignore the escaped closing quote
-                       l.ignore()
-                       l.next()
-                       l.ignore()
-               }
-       } else if r == '"' {
-               // ignore closing quote
-               l.ignore()
-       } else {
-               // handled by next state
-               l.backup()
-       }
-
-       return lexInsideShortcode
-}
-
-// Inline shortcodes has the form {{< myshortcode.inline >}}
-var inlineIdentifier = []byte("inline ")
-
-// scans an alphanumeric inside shortcode
-func lexIdentifierInShortcode(l *pageLexer) stateFunc {
-       lookForEnd := false
-Loop:
-       for {
-               switch r := l.next(); {
-               case isAlphaNumericOrHyphen(r):
-               // Allow forward slash inside names to make it possible to create namespaces.
-               case r == '/':
-               case r == '.':
-                       l.isInline = l.hasPrefix(inlineIdentifier)
-                       if !l.isInline {
-                               return l.errorf("period in shortcode name only allowed for inline identifiers")
-                       }
-               default:
-                       l.backup()
-                       word := string(l.input[l.start:l.pos])
-                       if l.closingState > 0 && !l.openShortcodes[word] {
-                               return l.errorf("closing tag for shortcode '%s' does not match start tag", word)
-                       } else if l.closingState > 0 {
-                               l.openShortcodes[word] = false
-                               lookForEnd = true
-                       }
-
-                       l.closingState = 0
-                       l.currShortcodeName = word
-                       l.openShortcodes[word] = true
-                       l.elementStepNum++
-                       if l.isInline {
-                               l.emit(tScNameInline)
-                       } else {
-                               l.emit(tScName)
-                       }
-                       break Loop
-               }
-       }
-
-       if lookForEnd {
-               return lexEndOfShortcode
-       }
-       return lexInsideShortcode
-}
-
-func lexEndOfShortcode(l *pageLexer) stateFunc {
-       l.isInline = false
-       if l.hasPrefix(l.currentRightShortcodeDelim()) {
-               return lexShortcodeRightDelim
-       }
-       switch r := l.next(); {
-       case isSpace(r):
-               l.ignore()
-       default:
-               return l.errorf("unclosed shortcode")
-       }
-       return lexEndOfShortcode
-}
-
-// scans the elements inside shortcode tags
-func lexInsideShortcode(l *pageLexer) stateFunc {
-       if l.hasPrefix(l.currentRightShortcodeDelim()) {
-               return lexShortcodeRightDelim
-       }
-       switch r := l.next(); {
-       case r == eof:
-               // eol is allowed inside shortcodes; this may go to end of document before it fails
-               return l.errorf("unclosed shortcode action")
-       case isSpace(r), isEndOfLine(r):
-               l.ignore()
-       case r == '=':
-               l.ignore()
-               return lexShortcodeQuotedParamVal(l, l.peek() != '\\', tScParamVal)
-       case r == '/':
-               if l.currShortcodeName == "" {
-                       return l.errorf("got closing shortcode, but none is open")
-               }
-               l.closingState++
-               l.emit(tScClose)
-       case r == '\\':
-               l.ignore()
-               if l.peek() == '"' {
-                       return lexShortcodeParam(l, true)
-               }
-       case l.elementStepNum > 0 && (isAlphaNumericOrHyphen(r) || r == '"'): // positional params can have quotes
-               l.backup()
-               return lexShortcodeParam(l, false)
-       case isAlphaNumeric(r):
-               l.backup()
-               return lexIdentifierInShortcode
-       default:
-               return l.errorf("unrecognized character in shortcode action: %#U. Note: Parameters with non-alphanumeric args must be quoted", r)
-       }
-       return lexInsideShortcode
-}
-
  // state helpers
  
  func (l *pageLexer) index(sep []byte) int {
@@ -767,29 +301,6 @@ func (l *pageLexer) hasPrefix(prefix []byte) bool {
         return bytes.HasPrefix(l.input[l.pos:], prefix)
  }
  
-func (l *pageLexer) currentLeftShortcodeDelimItem() ItemType {
-       return l.currLeftDelimItem
-}
-
-func (l *pageLexer) currentRightShortcodeDelimItem() ItemType {
-       return l.currRightDelimItem
-}
-
-func (l *pageLexer) currentLeftShortcodeDelim() []byte {
-       if l.currLeftDelimItem == tLeftDelimScWithMarkup {
-               return leftDelimScWithMarkup
-       }
-       return leftDelimScNoMarkup
-
-}
-
-func (l *pageLexer) currentRightShortcodeDelim() []byte {
-       if l.currRightDelimItem == tRightDelimScWithMarkup {
-               return rightDelimScWithMarkup
-       }
-       return rightDelimScNoMarkup
-}
-
  // helper functions
  
  // returns the min index >= 0
diff --git a/parser/pageparser/pagelexer_intro.go b/parser/pageparser/pagelexer_intro.go

new file mode 100644 (file)

index 0000000..56dd422
--- /dev/null
+++ b/parser/pageparser/pagelexer_intro.go
@@ -0,0 +1,202 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.) in Hugo.
+// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go"
+// It's on YouTube, Google it!.
+// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
+package pageparser
+
+func lexIntroSection(l *pageLexer) stateFunc {
+       l.summaryDivider = summaryDivider
+
+LOOP:
+       for {
+               r := l.next()
+               if r == eof {
+                       break
+               }
+
+               switch {
+               case r == '+':
+                       return l.lexFrontMatterSection(TypeFrontMatterTOML, r, "TOML", delimTOML)
+               case r == '-':
+                       return l.lexFrontMatterSection(TypeFrontMatterYAML, r, "YAML", delimYAML)
+               case r == '{':
+                       return lexFrontMatterJSON
+               case r == '#':
+                       return lexFrontMatterOrgMode
+               case r == byteOrderMark:
+                       l.emit(TypeIgnore)
+               case !isSpace(r) && !isEndOfLine(r):
+                       if r == '<' {
+                               l.backup()
+                               if l.hasPrefix(htmlCommentStart) {
+                                       // This may be commented out front matter, which should
+                                       // still be read.
+                                       l.consumeToNextLine()
+                                       l.isInHTMLComment = true
+                                       l.emit(TypeIgnore)
+                                       continue LOOP
+                               } else {
+                                       if l.pos > l.start {
+                                               l.emit(tText)
+                                       }
+                                       l.next()
+                                       // This is the start of a plain HTML document with no
+                                       // front matter. It can still contain shortcodes, so we
+                                       // have to keep looking.
+                                       l.emit(TypeHTMLStart)
+                               }
+                       }
+                       break LOOP
+               }
+       }
+
+       // Now move on to the shortcodes.
+       return lexMainSection
+}
+
+func lexEndFromtMatterHTMLComment(l *pageLexer) stateFunc {
+       l.isInHTMLComment = false
+       right := l.index(htmlCommentEnd)
+       if right == -1 {
+               return l.errorf("starting HTML comment with no end")
+       }
+       l.pos += right + len(htmlCommentEnd)
+       l.emit(TypeIgnore)
+
+       // Now move on to the shortcodes.
+       return lexMainSection
+}
+
+func lexFrontMatterJSON(l *pageLexer) stateFunc {
+       // Include the left delimiter
+       l.backup()
+
+       var (
+               inQuote bool
+               level   int
+       )
+
+       for {
+
+               r := l.next()
+
+               switch {
+               case r == eof:
+                       return l.errorf("unexpected EOF parsing JSON front matter")
+               case r == '{':
+                       if !inQuote {
+                               level++
+                       }
+               case r == '}':
+                       if !inQuote {
+                               level--
+                       }
+               case r == '"':
+                       inQuote = !inQuote
+               case r == '\\':
+                       // This may be an escaped quote. Make sure it's not marked as a
+                       // real one.
+                       l.next()
+               }
+
+               if level == 0 {
+                       break
+               }
+       }
+
+       l.consumeCRLF()
+       l.emit(TypeFrontMatterJSON)
+
+       return lexMainSection
+}
+
+func lexFrontMatterOrgMode(l *pageLexer) stateFunc {
+       /*
+               #+TITLE: Test File For chaseadamsio/goorgeous
+               #+AUTHOR: Chase Adams
+               #+DESCRIPTION: Just another golang parser for org content!
+       */
+
+       l.summaryDivider = summaryDividerOrg
+
+       l.backup()
+
+       if !l.hasPrefix(delimOrg) {
+               return lexMainSection
+       }
+
+       // Read lines until we no longer see a #+ prefix
+LOOP:
+       for {
+
+               r := l.next()
+
+               switch {
+               case r == '\n':
+                       if !l.hasPrefix(delimOrg) {
+                               break LOOP
+                       }
+               case r == eof:
+                       break LOOP
+
+               }
+       }
+
+       l.emit(TypeFrontMatterORG)
+
+       return lexMainSection
+
+}
+
+// Handle YAML or TOML front matter.
+func (l *pageLexer) lexFrontMatterSection(tp ItemType, delimr rune, name string, delim []byte) stateFunc {
+
+       for i := 0; i < 2; i++ {
+               if r := l.next(); r != delimr {
+                       return l.errorf("invalid %s delimiter", name)
+               }
+       }
+
+       // Let front matter start at line 1
+       wasEndOfLine := l.consumeCRLF()
+       // We don't care about the delimiters.
+       l.ignore()
+
+       var r rune
+
+       for {
+               if !wasEndOfLine {
+                       r = l.next()
+                       if r == eof {
+                               return l.errorf("EOF looking for end %s front matter delimiter", name)
+                       }
+               }
+
+               if wasEndOfLine || isEndOfLine(r) {
+                       if l.hasPrefix(delim) {
+                               l.emit(tp)
+                               l.pos += 3
+                               l.consumeCRLF()
+                               l.ignore()
+                               break
+                       }
+               }
+
+               wasEndOfLine = false
+       }
+
+       return lexMainSection
+}
diff --git a/parser/pageparser/pagelexer_shortcode.go b/parser/pageparser/pagelexer_shortcode.go

new file mode 100644 (file)

index 0000000..fe18245
--- /dev/null
+++ b/parser/pageparser/pagelexer_shortcode.go
@@ -0,0 +1,322 @@
+// Copyright 2018 The Hugo Authors. All rights reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package pageparser provides a parser for Hugo content files (Markdown, HTML etc.) in Hugo.
+// This implementation is highly inspired by the great talk given by Rob Pike called "Lexical Scanning in Go"
+// It's on YouTube, Google it!
+// See slides here: http://cuddle.googlecode.com/hg/talk/lex.html
+package pageparser
+
+// lexerShortcodeState holds the state the lexer keeps while it scans shortcodes.
+type lexerShortcodeState struct {
+       currLeftDelimItem  ItemType
+       currRightDelimItem ItemType
+       isInline           bool
+
+       // Only set when a shortcode is in opened state.
+       currShortcodeName string
+
+       // A value > 0 means the shortcode is on its way to be closed.
+       closingState int
+
+       // Step number in element.
+       elementStepNum int
+
+       // Number of elements (name + value = 2) found first.
+       paramElements int
+
+       // Set of shortcodes in open state.
+       openShortcodes map[string]bool
+}
+
+// Shortcode syntax: the byte sequences that delimit shortcodes and shortcode comments.
+var (
+       leftDelimSc            = []byte("{{")
+       leftDelimScNoMarkup    = []byte("{{<")
+       rightDelimScNoMarkup   = []byte(">}}")
+       leftDelimScWithMarkup  = []byte("{{%")
+       rightDelimScWithMarkup = []byte("%}}")
+       leftComment            = []byte("/*") // comments in this context are used to mark shortcodes as "not really a shortcode"
+       rightComment           = []byte("*/")
+)
+
+// isShortCodeStart reports whether the input at the current position begins
+// with one of the two left shortcode delimiters ({{% or {{<).
+func (l *pageLexer) isShortCodeStart() bool {
+       if l.hasPrefix(leftDelimScWithMarkup) {
+               return true
+       }
+       return l.hasPrefix(leftDelimScNoMarkup)
+}
+
+// lexShortcodeLeftDelim steps over the current left shortcode delimiter.
+// When a comment marker follows, control goes to lexShortcodeComment;
+// otherwise the delimiter item is emitted, the per-shortcode counters are
+// reset and scanning continues inside the shortcode.
+func lexShortcodeLeftDelim(l *pageLexer) stateFunc {
+       delimLen := len(l.currentLeftShortcodeDelim())
+       l.pos += delimLen
+
+       if !l.hasPrefix(leftComment) {
+               l.emit(l.currentLeftShortcodeDelimItem())
+               l.elementStepNum, l.paramElements = 0, 0
+               return lexInsideShortcode
+       }
+
+       return lexShortcodeComment
+}
+
+// lexShortcodeComment handles a shortcode whose content is wrapped in comment
+// markers, e.g. {{</* foo */>}}. Such a shortcode is "not really a shortcode":
+// everything except the two comment markers is emitted as text.
+//
+// It returns lexMainSection on success, or the error state when no closing
+// comment marker directly followed by the right delimiter can be found.
+func lexShortcodeComment(l *pageLexer) stateFunc {
+       // Build the closing sequence (right comment marker + right delimiter) in
+       // a fresh buffer. Appending directly to the package-level rightComment
+       // slice may write into its shared backing array whenever that array has
+       // spare capacity, which is a data race when lexers run concurrently.
+       rightDelim := l.currentRightShortcodeDelim()
+       closing := make([]byte, 0, len(rightComment)+len(rightDelim))
+       closing = append(closing, rightComment...)
+       closing = append(closing, rightDelim...)
+
+       posRightComment := l.index(closing)
+       if posRightComment <= 1 {
+               return l.errorf("comment must be closed")
+       }
+       // we emit all as text, except the comment markers
+       l.emit(tText)
+       // Drop the left comment marker.
+       l.pos += len(leftComment)
+       l.ignore()
+       // Emit the content between the two comment markers.
+       l.pos += posRightComment - len(leftComment)
+       l.emit(tText)
+       // Drop the right comment marker.
+       l.pos += len(rightComment)
+       l.ignore()
+       // Emit the right shortcode delimiter as plain text.
+       l.pos += len(rightDelim)
+       l.emit(tText)
+       return lexMainSection
+}
+
+// lexShortcodeRightDelim clears the closing state, consumes the current right
+// shortcode delimiter, emits its item and hands control back to lexMainSection.
+func lexShortcodeRightDelim(l *pageLexer) stateFunc {
+       delim := l.currentRightShortcodeDelim()
+
+       l.closingState = 0
+       l.pos += len(delim)
+       l.emit(l.currentRightShortcodeDelimItem())
+
+       return lexMainSection
+}
+
+// lexShortcodeParam scans one shortcode parameter, which has one of these forms:
+// 1. param
+// 2. "param" or "param\"
+// 3. param="123" or param="123\"
+// 4. param="Some \"escaped\" text"
+//
+// l.paramElements records the style seen first (1 = positional, 2 = named);
+// mixing the two styles inside one shortcode is reported as an error.
+func lexShortcodeParam(l *pageLexer, escapedQuoteStart bool) stateFunc {
+       r := l.next()
+
+       if r == '"' {
+               // a positional param with quotes
+               if l.paramElements == 2 {
+                       return l.errorf("got quoted positional parameter. Cannot mix named and positional parameters")
+               }
+               l.paramElements = 1
+               l.backup()
+               return lexShortcodeQuotedParamVal(l, !escapedQuoteStart, tScParam)
+       }
+
+       // Consume the name. An '=' after the first rune marks a named param.
+       named := false
+       for isAlphaNumericOrHyphen(r) {
+               r = l.next()
+               if r == '=' {
+                       named = true
+                       break
+               }
+       }
+       // The rune that ended the scan is not part of the param.
+       l.backup()
+
+       switch {
+       case l.paramElements == 0:
+               // First param in this shortcode decides the style.
+               l.paramElements = 1
+               if named {
+                       l.paramElements = 2
+               }
+       case named && l.paramElements == 1:
+               return l.errorf("got named parameter '%s'. Cannot mix named and positional parameters", l.current())
+       case !named && l.paramElements == 2:
+               return l.errorf("got positional parameter '%s'. Cannot mix named and positional parameters", l.current())
+       }
+
+       l.emit(tScParam)
+       return lexInsideShortcode
+}
+
+// lexShortcodeQuotedParamVal scans a quoted value and emits it as an item of
+// type typ, without the surrounding quotes.
+//
+// escapedQuotedValuesAllowed controls how \" is treated once the opening quote
+// has been seen: when true it is an escaped quote inside the value, when false
+// it terminates the value.
+//
+// It returns lexInsideShortcode, or the error state if EOF or a newline is
+// reached before the value is terminated.
+func lexShortcodeQuotedParamVal(l *pageLexer, escapedQuotedValuesAllowed bool, typ ItemType) stateFunc {
+       openQuoteFound := false
+       escapedInnerQuoteFound := false
+       // Set to 1 when the next quote is escaped and therefore part of the value.
+       escapedQuoteState := 0
+
+Loop:
+       for {
+               switch r := l.next(); {
+               case r == '\\':
+                       if l.peek() == '"' {
+                               if openQuoteFound && !escapedQuotedValuesAllowed {
+                                       // \" ends the value; un-read the backslash so it is not emitted.
+                                       l.backup()
+                                       break Loop
+                               } else if openQuoteFound {
+                                       // the coming quote is inside
+                                       escapedInnerQuoteFound = true
+                                       escapedQuoteState = 1
+                               }
+                       }
+               case r == eof, r == '\n':
+                       return l.errorf("unterminated quoted string in shortcode parameter-argument: '%s'", l.current())
+               case r == '"':
+                       if escapedQuoteState == 0 {
+                               if openQuoteFound {
+                                       // Closing quote; un-read it so it is not part of the value.
+                                       l.backup()
+                                       break Loop
+
+                               } else {
+                                       // Opening quote; drop everything up to and including it.
+                                       openQuoteFound = true
+                                       l.ignore()
+                               }
+                       } else {
+                               // This quote was escaped; the escape is now used up.
+                               escapedQuoteState = 0
+                       }
+
+               }
+       }
+
+       // NOTE(review): ignoreEscapesAndEmit presumably strips the escape
+       // characters from the value before emitting it — confirm.
+       if escapedInnerQuoteFound {
+               l.ignoreEscapesAndEmit(typ)
+       } else {
+               l.emit(typ)
+       }
+
+       // Consume the terminator that ended the loop above.
+       r := l.next()
+
+       if r == '\\' {
+               if l.peek() == '"' {
+                       // ignore the escaped closing quote
+                       l.ignore()
+                       l.next()
+                       l.ignore()
+               }
+       } else if r == '"' {
+               // ignore closing quote
+               l.ignore()
+       } else {
+               // handled by next state
+               l.backup()
+       }
+
+       return lexInsideShortcode
+}
+
+// Inline shortcodes have the form {{< myshortcode.inline >}}
+var inlineIdentifier = []byte("inline ")
+
+// lexIdentifierInShortcode scans a shortcode name (alphanumerics, hyphens and
+// forward slashes, optionally followed by ".inline") and emits it as tScName
+// or tScNameInline.
+//
+// When the name belongs to a closing tag it must match a shortcode recorded as
+// open; in that case the next state is lexEndOfShortcode, otherwise it is
+// lexInsideShortcode.
+func lexIdentifierInShortcode(l *pageLexer) stateFunc {
+       lookForEnd := false
+Loop:
+       for {
+               switch r := l.next(); {
+               case isAlphaNumericOrHyphen(r):
+               // Allow forward slash inside names to make it possible to create namespaces.
+               case r == '/':
+               case r == '.':
+                       // A period is only valid as the start of the ".inline " suffix.
+                       l.isInline = l.hasPrefix(inlineIdentifier)
+                       if !l.isInline {
+                               return l.errorf("period in shortcode name only allowed for inline identifiers")
+                       }
+               default:
+                       // First rune that is not part of the name: un-read it and finish.
+                       l.backup()
+                       word := string(l.input[l.start:l.pos])
+                       if l.closingState > 0 && !l.openShortcodes[word] {
+                               return l.errorf("closing tag for shortcode '%s' does not match start tag", word)
+                       } else if l.closingState > 0 {
+                               l.openShortcodes[word] = false
+                               lookForEnd = true
+                       }
+
+                       l.closingState = 0
+                       l.currShortcodeName = word
+                       // NOTE(review): this runs for closing tags too, so the false
+                       // stored a few lines above is overwritten and the shortcode
+                       // stays marked as open — confirm this is intended.
+                       l.openShortcodes[word] = true
+                       l.elementStepNum++
+                       if l.isInline {
+                               l.emit(tScNameInline)
+                       } else {
+                               l.emit(tScName)
+                       }
+                       break Loop
+               }
+       }
+
+       if lookForEnd {
+               return lexEndOfShortcode
+       }
+       return lexInsideShortcode
+}
+
+// lexEndOfShortcode runs after the name of a closing shortcode tag. It clears
+// the inline flag, skips whitespace one rune per invocation, and moves on to
+// lexShortcodeRightDelim once the right delimiter is next in the input. Any
+// other rune is reported as an unclosed shortcode.
+func lexEndOfShortcode(l *pageLexer) stateFunc {
+       l.isInline = false
+
+       if l.hasPrefix(l.currentRightShortcodeDelim()) {
+               return lexShortcodeRightDelim
+       }
+
+       if r := l.next(); !isSpace(r) {
+               return l.errorf("unclosed shortcode")
+       }
+
+       l.ignore()
+       return lexEndOfShortcode
+}
+
+// lexInsideShortcode scans the elements inside shortcode tags.
+//
+// Each invocation handles one element (or one ignorable rune) and returns the
+// state that continues from there: itself, one of the more specific shortcode
+// states, or the error state.
+func lexInsideShortcode(l *pageLexer) stateFunc {
+       if l.hasPrefix(l.currentRightShortcodeDelim()) {
+               return lexShortcodeRightDelim
+       }
+       switch r := l.next(); {
+       case r == eof:
+               // eol is allowed inside shortcodes; this may go to end of document before it fails
+               return l.errorf("unclosed shortcode action")
+       case isSpace(r), isEndOfLine(r):
+               l.ignore()
+       case r == '=':
+               // Value of a named param. A value that starts with a backslash
+               // disallows escaped quotes inside it.
+               l.ignore()
+               return lexShortcodeQuotedParamVal(l, l.peek() != '\\', tScParamVal)
+       case r == '/':
+               // Start of a closing tag.
+               if l.currShortcodeName == "" {
+                       return l.errorf("got closing shortcode, but none is open")
+               }
+               l.closingState++
+               l.emit(tScClose)
+       case r == '\\':
+               // Drop the backslash; \" starts a param whose opening quote is escaped.
+               l.ignore()
+               if l.peek() == '"' {
+                       return lexShortcodeParam(l, true)
+               }
+       case l.elementStepNum > 0 && (isAlphaNumericOrHyphen(r) || r == '"'): // positional params can have quotes
+               // The shortcode name has been scanned already, so this is a param.
+               l.backup()
+               return lexShortcodeParam(l, false)
+       case isAlphaNumeric(r):
+               // Nothing has been scanned in this shortcode yet, so this is its name.
+               l.backup()
+               return lexIdentifierInShortcode
+       default:
+               return l.errorf("unrecognized character in shortcode action: %#U. Note: Parameters with non-alphanumeric args must be quoted", r)
+       }
+       return lexInsideShortcode
+}
+
+// currentLeftShortcodeDelimItem returns the item type of the left delimiter of
+// the shortcode currently being lexed.
+func (l *pageLexer) currentLeftShortcodeDelimItem() ItemType {
+       return l.currLeftDelimItem
+}
+
+// currentRightShortcodeDelimItem returns the item type of the right delimiter
+// of the shortcode currently being lexed.
+func (l *pageLexer) currentRightShortcodeDelimItem() ItemType {
+       return l.currRightDelimItem
+}
+
+// currentLeftShortcodeDelim returns the bytes of the left delimiter that
+// matches the current left delimiter item: {{% for the with-markup item and
+// {{< for anything else.
+func (l *pageLexer) currentLeftShortcodeDelim() []byte {
+       switch l.currLeftDelimItem {
+       case tLeftDelimScWithMarkup:
+               return leftDelimScWithMarkup
+       default:
+               return leftDelimScNoMarkup
+       }
+}
+
+// currentRightShortcodeDelim returns the bytes of the right delimiter that
+// matches the current right delimiter item: %}} for the with-markup item and
+// >}} for anything else.
+func (l *pageLexer) currentRightShortcodeDelim() []byte {
+       switch l.currRightDelimItem {
+       case tRightDelimScWithMarkup:
+               return rightDelimScWithMarkup
+       default:
+               return rightDelimScNoMarkup
+       }
+}
author	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Mon, 17 Dec 2018 19:54:06 +0000 (20:54 +0100)
committer	Bjørn Erik Pedersen <bjorn.erik.pedersen@gmail.com>
	Thu, 20 Dec 2018 19:08:01 +0000 (20:08 +0100)
parser/pageparser/pagelexer.go		patch \| blob \| history
parser/pageparser/pagelexer_intro.go	[new file with mode: 0644]	patch \| blob
parser/pageparser/pagelexer_shortcode.go	[new file with mode: 0644]	patch \| blob