buildkit/frontend/dockerfile/parser/parser.go

// Package parser implements a parser and parse tree dumper for Dockerfiles.
package parser

import (
	"bufio"
	"bytes"
	"fmt"
	"io"
	"regexp"
	"strings"
	"unicode"

	"github.com/docker/docker/builder/dockerfile/command"
)

// Node is a structure used to represent a parse tree.
//
// In the node there are three fields, Value, Next, and Children. Value is the
// current token's string value. Next is always the next non-child token, and
// children contains all the children. Here's an example:
//
// (value next (child child-next child-next-next) next-next)
//
// This data structure is frankly pretty lousy for handling complex languages,
// but lucky for us the Dockerfile isn't very complicated. This structure
// works a little more effectively than a "proper" parse tree for our needs.
//
type Node struct {
	Value      string          // actual content
	Next       *Node           // the next item in the current sexp
	Children   []*Node         // the children of this sexp
	Attributes map[string]bool // special attributes for this node
	Original   string          // original line used before parsing
	Flags      []string        // only top Node should have this set
	StartLine  int             // the line in the original dockerfile where the node begins
	EndLine    int             // the line in the original dockerfile where the node ends
}

// Directive is the structure used during a build run to hold the state of
// parsing directives.
type Directive struct {
	EscapeToken           rune           // Current escape token
	LineContinuationRegex *regexp.Regexp // Current line contination regex
	LookingForDirectives  bool           // Whether we are currently looking for directives
	EscapeSeen            bool           // Whether the escape directive has been seen
}

var (
	dispatch           map[string]func(string, *Directive) (*Node, map[string]bool, error)
	tokenWhitespace    = regexp.MustCompile(`[\t\v\f\r ]+`)
	tokenEscapeCommand = regexp.MustCompile(`^#[ \t]*escape[ \t]*=[ \t]*(?P<escapechar>.).*$`)
	tokenComment       = regexp.MustCompile(`^#.*$`)
)

// DefaultEscapeToken is the default escape token
const DefaultEscapeToken = "\\"

// SetEscapeToken sets the default token for escaping characters in a Dockerfile.
func SetEscapeToken(s string, d *Directive) error {
	if s != "`" && s != "\\" {
		return fmt.Errorf("invalid ESCAPE '%s'. Must be ` or \\", s)
	}
	d.EscapeToken = rune(s[0])
	d.LineContinuationRegex = regexp.MustCompile(`\` + s + `[ \t]*$`)
	return nil
}

func init() {
	// Dispatch Table. see line_parsers.go for the parse functions.
	// The command is parsed and mapped to the line parser. The line parser
	// receives the arguments but not the command, and returns an AST after
	// reformulating the arguments according to the rules in the parser
	// functions. Errors are propagated up by Parse() and the resulting AST can
	// be incorporated directly into the existing AST as a next.
	dispatch = map[string]func(string, *Directive) (*Node, map[string]bool, error){
		command.Add:         parseMaybeJSONToList,
		command.Arg:         parseNameOrNameVal,
		command.Cmd:         parseMaybeJSON,
		command.Copy:        parseMaybeJSONToList,
		command.Entrypoint:  parseMaybeJSON,
		command.Env:         parseEnv,
		command.Expose:      parseStringsWhitespaceDelimited,
		command.From:        parseString,
		command.Healthcheck: parseHealthConfig,
		command.Label:       parseLabel,
		command.Maintainer:  parseString,
		command.Onbuild:     parseSubCommand,
		command.Run:         parseMaybeJSON,
		command.Shell:       parseMaybeJSON,
		command.StopSignal:  parseString,
		command.User:        parseString,
		command.Volume:      parseMaybeJSONToList,
		command.Workdir:     parseString,
	}
}

// ParseLine parses a line and returns the remainder.
func ParseLine(line string, d *Directive, ignoreCont bool) (string, *Node, error) {
	// Handle the parser directive '# escape=<char>. Parser directives must precede
	// any builder instruction or other comments, and cannot be repeated.
	if d.LookingForDirectives {
		tecMatch := tokenEscapeCommand.FindStringSubmatch(strings.ToLower(line))
		if len(tecMatch) > 0 {
			if d.EscapeSeen == true {
				return "", nil, fmt.Errorf("only one escape parser directive can be used")
			}
			for i, n := range tokenEscapeCommand.SubexpNames() {
				if n == "escapechar" {
					if err := SetEscapeToken(tecMatch[i], d); err != nil {
						return "", nil, err
					}
					d.EscapeSeen = true
					return "", nil, nil
				}
			}
		}
	}

	d.LookingForDirectives = false

	if line = stripComments(line); line == "" {
		return "", nil, nil
	}

	if !ignoreCont && d.LineContinuationRegex.MatchString(line) {
		line = d.LineContinuationRegex.ReplaceAllString(line, "")
		return line, nil, nil
	}

	cmd, flags, args, err := splitCommand(line)
	if err != nil {
		return "", nil, err
	}

	node := &Node{}
	node.Value = cmd

	sexp, attrs, err := fullDispatch(cmd, args, d)
	if err != nil {
		return "", nil, err
	}

	node.Next = sexp
	node.Attributes = attrs
	node.Original = line
	node.Flags = flags

	return "", node, nil
}

// Parse is the main parse routine.
// It handles an io.ReadWriteCloser and returns the root of the AST.
func Parse(rwc io.Reader, d *Directive) (*Node, error) {
	currentLine := 0
	root := &Node{}
	root.StartLine = -1
	scanner := bufio.NewScanner(rwc)

	utf8bom := []byte{0xEF, 0xBB, 0xBF}
	for scanner.Scan() {
		scannedBytes := scanner.Bytes()
		// We trim UTF8 BOM
		if currentLine == 0 {
			scannedBytes = bytes.TrimPrefix(scannedBytes, utf8bom)
		}
		scannedLine := strings.TrimLeftFunc(string(scannedBytes), unicode.IsSpace)
		currentLine++
		line, child, err := ParseLine(scannedLine, d, false)
		if err != nil {
			return nil, err
		}
		startLine := currentLine

		if line != "" && child == nil {
			for scanner.Scan() {
				newline := scanner.Text()
				currentLine++

				if stripComments(strings.TrimSpace(newline)) == "" {
					continue
				}

				line, child, err = ParseLine(line+newline, d, false)
				if err != nil {
					return nil, err
				}

				if child != nil {
					break
				}
			}
			if child == nil && line != "" {
				// When we call ParseLine we'll pass in 'true' for
				// the ignoreCont param if we're at the EOF. This will
				// prevent the func from returning immediately w/o
				// parsing the line thinking that there's more input
				// to come.

				_, child, err = ParseLine(line, d, scanner.Err() == nil)
				if err != nil {
					return nil, err
				}
			}
		}

		if child != nil {
			// Update the line information for the current child.
			child.StartLine = startLine
			child.EndLine = currentLine
			// Update the line information for the root. The starting line of the root is always the
			// starting line of the first child and the ending line is the ending line of the last child.
			if root.StartLine < 0 {
				root.StartLine = currentLine
			}
			root.EndLine = currentLine
			root.Children = append(root.Children, child)
		}
	}

	return root, nil
}