Fix diff skipping lines (#13157)

* Fix diff skipping lines Backport #13154 ParsePatch previously just skipped all lines that start with "+++ " or "--- " and makes no attempt to see these lines in context. This PR rewrites ParsePatch to pay attention to context and position within a patch, ensuring that --- and +++ are only skipped if appropriate. This PR also fixes several issues with incomplete files. Fix https://codeberg.org/Codeberg/Community/issues/308 Fix #13153 Signed-off-by: Andrew Thornton <art27@cantab.net> * Add testcase Signed-off-by: Andrew Thornton <art27@cantab.net> * fix comment * simplify error handling Signed-off-by: Andrew Thornton <art27@cantab.net> * never return io.EOF Signed-off-by: Andrew Thornton <art27@cantab.net>
2020-10-16 22:09:25 +01:00 · 2020-10-16 22:09:25 +01:00 · e6a82047ee
parent 99fb256411
commit e6a82047ee
2 changed files with 355 additions and 188 deletions
--- a/services/gitdiff/gitdiff.go
+++ b/services/gitdiff/gitdiff.go
@ -405,91 +405,251 @@ func (diff *Diff) NumFiles() int {

 const cmdDiffHead = "diff --git "

-// ParsePatch builds a Diff object from a io.Reader and some
-// parameters.
-// TODO: move this function to gogits/git-module
+// ParsePatch builds a Diff object from a io.Reader and some parameters.
 func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*Diff, error) {
-	var (
-		diff = &Diff{Files: make([]*DiffFile, 0)}
+	var curFile *DiffFile

-		curFile    = &DiffFile{}
-		curSection = &DiffSection{
-			Lines: make([]*DiffLine, 0, 10),
+	diff := &Diff{Files: make([]*DiffFile, 0)}
+
+	sb := strings.Builder{}
+
+	// OK let's set a reasonable buffer size.
+	// This should be let's say at least the size of maxLineCharacters or 4096 whichever is larger.
+	readerSize := maxLineCharacters
+	if readerSize < 4096 {
+		readerSize = 4096
 	}

-		leftLine, rightLine int
-		lineCount           int
+	input := bufio.NewReaderSize(reader, readerSize)
+	line, err := input.ReadString('\n')
+	if err != nil {
+		if err == io.EOF {
+			return diff, nil
+		}
+		return diff, err
+	}
+parsingLoop:
+	for {
+		// 1. A patch file always begins with `diff --git ` + `a/path b/path` (possibly quoted)
+		// if it does not we have bad input!
+		if !strings.HasPrefix(line, cmdDiffHead) {
+			return diff, fmt.Errorf("Invalid first file line: %s", line)
+		}
+
+		// TODO: Handle skipping first n files
+		if len(diff.Files) >= maxFiles {
+			diff.IsIncomplete = true
+			_, err := io.Copy(ioutil.Discard, reader)
+			if err != nil {
+				// By the definition of io.Copy this never returns io.EOF
+				return diff, fmt.Errorf("Copy: %v", err)
+			}
+			break parsingLoop
+		}
+
+		curFile = createDiffFile(diff, line)
+		diff.Files = append(diff.Files, curFile)
+
+		// 2. It is followed by one or more extended header lines:
+		//
+		//     old mode <mode>
+		//     new mode <mode>
+		//     deleted file mode <mode>
+		//     new file mode <mode>
+		//     copy from <path>
+		//     copy to <path>
+		//     rename from <path>
+		//     rename to <path>
+		//     similarity index <number>
+		//     dissimilarity index <number>
+		//     index <hash>..<hash> <mode>
+		//
+		// * <mode> 6-digit octal numbers including the file type and file permission bits.
+		// * <path> does not include the a/ and b/ prefixes
+		// * <number> percentage of unchanged lines for similarity, percentage of changed
+		//   lines dissimilarity as integer rounded down with terminal %. 100% => equal files.
+		// * The index line includes the blob object names before and after the change.
+		//   The <mode> is included if the file mode does not change; otherwise, separate
+		//   lines indicate the old and the new mode.
+		// 3. Following this header the "standard unified" diff format header may be encountered: (but not for every case...)
+		//
+		//     --- a/<path>
+		//     +++ b/<path>
+		//
+		// With multiple hunks
+		//
+		//     @@ <hunk descriptor> @@
+		//     +added line
+		//     -removed line
+		//      unchanged line
+		//
+		// 4. Binary files get:
+		//
+		//     Binary files a/<path> and b/<path> differ
+		//
+		// but one of a/<path> and b/<path> could be /dev/null.
+	curFileLoop:
+		for {
+			line, err = input.ReadString('\n')
+			if err != nil {
+				if err != io.EOF {
+					return diff, err
+				}
+				break parsingLoop
+			}
+			switch {
+			case strings.HasPrefix(line, "old mode ") ||
+				strings.HasPrefix(line, "new mode "):
+				if strings.HasSuffix(line, " 160000\n") {
+					curFile.IsSubmodule = true
+				}
+			case strings.HasPrefix(line, "copy from "):
+				curFile.IsRenamed = true
+				curFile.Type = DiffFileCopy
+			case strings.HasPrefix(line, "copy to "):
+				curFile.IsRenamed = true
+				curFile.Type = DiffFileCopy
+			case strings.HasPrefix(line, "new file"):
+				curFile.Type = DiffFileAdd
+				curFile.IsCreated = true
+				if strings.HasSuffix(line, " 160000\n") {
+					curFile.IsSubmodule = true
+				}
+			case strings.HasPrefix(line, "deleted"):
+				curFile.Type = DiffFileDel
+				curFile.IsDeleted = true
+				if strings.HasSuffix(line, " 160000\n") {
+					curFile.IsSubmodule = true
+				}
+			case strings.HasPrefix(line, "index"):
+				if strings.HasSuffix(line, " 160000\n") {
+					curFile.IsSubmodule = true
+				}
+			case strings.HasPrefix(line, "similarity index 100%"):
+				curFile.Type = DiffFileRename
+			case strings.HasPrefix(line, "Binary"):
+				curFile.IsBin = true
+			case strings.HasPrefix(line, "--- "):
+				// Do nothing with this line
+			case strings.HasPrefix(line, "+++ "):
+				// Do nothing with this line
+				lineBytes, isFragment, err := parseHunks(curFile, maxLines, maxLineCharacters, input)
+				diff.TotalAddition += curFile.Addition
+				diff.TotalDeletion += curFile.Deletion
+				if err != nil {
+					if err != io.EOF {
+						return diff, err
+					}
+					break parsingLoop
+				}
+				sb.Reset()
+				_, _ = sb.Write(lineBytes)
+				for isFragment {
+					lineBytes, isFragment, err = input.ReadLine()
+					if err != nil {
+						// Now by the definition of ReadLine this cannot be io.EOF
+						return diff, fmt.Errorf("Unable to ReadLine: %v", err)
+					}
+					_, _ = sb.Write(lineBytes)
+				}
+				line = sb.String()
+				sb.Reset()
+
+				break curFileLoop
+			}
+		}
+
+	}
+
+	// FIXME: There are numerous issues with this:
+	// - we might want to consider detecting encoding while parsing but...
+	// - we're likely to fail to get the correct encoding here anyway as we won't have enough information
+	// - and this doesn't really account for changes in encoding
+	var buf bytes.Buffer
+	for _, f := range diff.Files {
+		buf.Reset()
+		for _, sec := range f.Sections {
+			for _, l := range sec.Lines {
+				if l.Type == DiffLineSection {
+					continue
+				}
+				buf.WriteString(l.Content[1:])
+				buf.WriteString("\n")
+			}
+		}
+		charsetLabel, err := charset.DetectEncoding(buf.Bytes())
+		if charsetLabel != "UTF-8" && err == nil {
+			encoding, _ := stdcharset.Lookup(charsetLabel)
+			if encoding != nil {
+				d := encoding.NewDecoder()
+				for _, sec := range f.Sections {
+					for _, l := range sec.Lines {
+						if l.Type == DiffLineSection {
+							continue
+						}
+						if c, _, err := transform.String(d, l.Content[1:]); err == nil {
+							l.Content = l.Content[0:1] + c
+						}
+					}
+				}
+			}
+		}
+	}
+
+	return diff, nil
+}
+
+func parseHunks(curFile *DiffFile, maxLines, maxLineCharacters int, input *bufio.Reader) (lineBytes []byte, isFragment bool, err error) {
+	sb := strings.Builder{}
+
+	var (
+		curSection        *DiffSection
 		curFileLinesCount int
 		curFileLFSPrefix  bool
 	)

-	input := bufio.NewReader(reader)
-	isEOF := false
-	for !isEOF {
-		var linebuf bytes.Buffer
+	leftLine, rightLine := 1, 1
+
 	for {
-			b, err := input.ReadByte()
+		sb.Reset()
+		lineBytes, isFragment, err = input.ReadLine()
 		if err != nil {
 			if err == io.EOF {
-					isEOF = true
-					break
-				} else {
-					return nil, fmt.Errorf("ReadByte: %v", err)
+				return
 			}
+			err = fmt.Errorf("Unable to ReadLine: %v", err)
+			return
 		}
-			if b == '\n' {
-				break
-			}
-			if linebuf.Len() < maxLineCharacters {
-				linebuf.WriteByte(b)
-			} else if linebuf.Len() == maxLineCharacters {
-				curFile.IsIncomplete = true
-			}
-		}
-		line := linebuf.String()
-
-		if strings.HasPrefix(line, "+++ ") || strings.HasPrefix(line, "--- ") || len(line) == 0 {
-			continue
+		if lineBytes[0] == 'd' {
+			// End of hunks
+			return
 		}

-		trimLine := strings.Trim(line, "+- ")
-
-		if trimLine == models.LFSMetaFileIdentifier {
-			curFileLFSPrefix = true
-		}
-
-		if curFileLFSPrefix && strings.HasPrefix(trimLine, models.LFSMetaFileOidPrefix) {
-			oid := strings.TrimPrefix(trimLine, models.LFSMetaFileOidPrefix)
-
-			if len(oid) == 64 {
-				m := &models.LFSMetaObject{Oid: oid}
-				count, err := models.Count(m)
-
-				if err == nil && count > 0 {
-					curFile.IsBin = true
-					curFile.IsLFSFile = true
-					curSection.Lines = nil
-				}
-			}
-		}
-
-		curFileLinesCount++
-		lineCount++
-
-		// Diff data too large, we only show the first about maxLines lines
+		switch lineBytes[0] {
+		case '@':
 			if curFileLinesCount >= maxLines {
 				curFile.IsIncomplete = true
-		}
-		switch {
-		case line[0] == ' ':
-			diffLine := &DiffLine{Type: DiffLinePlain, Content: line, LeftIdx: leftLine, RightIdx: rightLine}
-			leftLine++
-			rightLine++
-			curSection.Lines = append(curSection.Lines, diffLine)
 				continue
-		case line[0] == '@':
+			}
+
+			_, _ = sb.Write(lineBytes)
+			for isFragment {
+				// This is very odd indeed - we're in a section header and the line is too long
+				// This really shouldn't happen...
+				lineBytes, isFragment, err = input.ReadLine()
+				if err != nil {
+					// Now by the definition of ReadLine this cannot be io.EOF
+					err = fmt.Errorf("Unable to ReadLine: %v", err)
+					return
+				}
+				_, _ = sb.Write(lineBytes)
+			}
+			line := sb.String()
+
+			// Create a new section to represent this hunk
 			curSection = &DiffSection{}
 			curFile.Sections = append(curFile.Sections, curSection)
+
 			lineSectionInfo := getDiffLineSectionInfo(curFile.Name, line, leftLine-1, rightLine-1)
 			diffLine := &DiffLine{
 				Type:        DiffLineSection,
@ -501,146 +661,132 @@ func ParsePatch(maxLines, maxLineCharacters, maxFiles int, reader io.Reader) (*D
 			leftLine = lineSectionInfo.LeftIdx
 			rightLine = lineSectionInfo.RightIdx
 			continue
-		case line[0] == '+':
+		case '\\':
+			if curFileLinesCount >= maxLines {
+				curFile.IsIncomplete = true
+				continue
+			}
+			// This is used only to indicate that the current file does not have a terminal newline
+			if !bytes.Equal(lineBytes, []byte("\\ No newline at end of file")) {
+				err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes))
+				return
+			}
+			// Technically this should be the end the file!
+			// FIXME: we should be putting a marker at the end of the file if there is no terminal new line
+			continue
+		case '+':
+			curFileLinesCount++
 			curFile.Addition++
-			diff.TotalAddition++
-			diffLine := &DiffLine{Type: DiffLineAdd, Content: line, RightIdx: rightLine}
+			if curFileLinesCount >= maxLines {
+				curFile.IsIncomplete = true
+				continue
+			}
+			diffLine := &DiffLine{Type: DiffLineAdd, RightIdx: rightLine}
 			rightLine++
 			curSection.Lines = append(curSection.Lines, diffLine)
-			continue
-		case line[0] == '-':
+		case '-':
+			curFileLinesCount++
 			curFile.Deletion++
-			diff.TotalDeletion++
-			diffLine := &DiffLine{Type: DiffLineDel, Content: line, LeftIdx: leftLine}
+			if curFileLinesCount >= maxLines {
+				curFile.IsIncomplete = true
+				continue
+			}
+			diffLine := &DiffLine{Type: DiffLineDel, LeftIdx: leftLine}
 			if leftLine > 0 {
 				leftLine++
 			}
 			curSection.Lines = append(curSection.Lines, diffLine)
-		case strings.HasPrefix(line, "Binary"):
-			curFile.IsBin = true
+		case ' ':
+			curFileLinesCount++
+			if curFileLinesCount >= maxLines {
+				curFile.IsIncomplete = true
 				continue
 			}
+			diffLine := &DiffLine{Type: DiffLinePlain, LeftIdx: leftLine, RightIdx: rightLine}
+			leftLine++
+			rightLine++
+			curSection.Lines = append(curSection.Lines, diffLine)
+		default:
+			// This is unexpected
+			err = fmt.Errorf("Unexpected line in hunk: %s", string(lineBytes))
+			return
+		}

-		// Get new file.
-		if strings.HasPrefix(line, cmdDiffHead) {
-			if len(diff.Files) >= maxFiles {
-				diff.IsIncomplete = true
-				_, err := io.Copy(ioutil.Discard, reader)
+		line := string(lineBytes)
+		if isFragment {
+			curFile.IsIncomplete = true
+			for isFragment {
+				lineBytes, isFragment, err = input.ReadLine()
 				if err != nil {
-					return nil, fmt.Errorf("Copy: %v", err)
+					// Now by the definition of ReadLine this cannot be io.EOF
+					err = fmt.Errorf("Unable to ReadLine: %v", err)
+					return
+				}
+			}
+		}
+		curSection.Lines[len(curSection.Lines)-1].Content = line
+
+		// handle LFS
+		if line[1:] == models.LFSMetaFileIdentifier {
+			curFileLFSPrefix = true
+		} else if curFileLFSPrefix && strings.HasPrefix(line[1:], models.LFSMetaFileOidPrefix) {
+			oid := strings.TrimPrefix(line[1:], models.LFSMetaFileOidPrefix)
+			if len(oid) == 64 {
+				m := &models.LFSMetaObject{Oid: oid}
+				count, err := models.Count(m)
+
+				if err == nil && count > 0 {
+					curFile.IsBin = true
+					curFile.IsLFSFile = true
+					curSection.Lines = nil
+				}
+			}
+		}
 	}
-				break
 }

-			// Note: In case file name is surrounded by double quotes (it happens only in git-shell).
-			// e.g. diff --git "a/xxx" "b/xxx"
-			var a string
-			var b string
-
-			rd := strings.NewReader(line[len(cmdDiffHead):])
-			char, _ := rd.ReadByte()
-			_ = rd.UnreadByte()
-			if char == '"' {
-				fmt.Fscanf(rd, "%q ", &a)
-				if a[0] == '\\' {
-					a = a[1:]
-				}
-			} else {
-				fmt.Fscanf(rd, "%s ", &a)
-			}
-			char, _ = rd.ReadByte()
-			_ = rd.UnreadByte()
-			if char == '"' {
-				fmt.Fscanf(rd, "%q", &b)
-				if b[0] == '\\' {
-					b = b[1:]
-				}
-			} else {
-				fmt.Fscanf(rd, "%s", &b)
-			}
-			a = a[2:]
-			b = b[2:]
-
-			curFile = &DiffFile{
-				Name:      b,
-				OldName:   a,
+func createDiffFile(diff *Diff, line string) *DiffFile {
+	// The a/ and b/ filenames are the same unless rename/copy is involved.
+	// Especially, even for a creation or a deletion, /dev/null is not used
+	// in place of the a/ or b/ filenames.
+	//
+	// When rename/copy is involved, file1 and file2 show the name of the
+	// source file of the rename/copy and the name of the file that rename/copy
+	// produces, respectively.
+	//
+	// Path names are quoted if necessary.
+	//
+	// This means that you should always be able to determine the file name even when there
+	// there is potential ambiguity...
+	//
+	// but we can be simpler with our heuristics by just forcing git to prefix things nicely
+	curFile := &DiffFile{
 		Index:    len(diff.Files) + 1,
 		Type:     DiffFileChange,
 		Sections: make([]*DiffSection, 0, 10),
-				IsRenamed: a != b,
-			}
-			diff.Files = append(diff.Files, curFile)
-			curFileLinesCount = 0
-			leftLine = 1
-			rightLine = 1
-			curFileLFSPrefix = false
-
-			// Check file diff type and is submodule.
-			for {
-				line, err := input.ReadString('\n')
-				if err != nil {
-					if err == io.EOF {
-						isEOF = true
-					} else {
-						return nil, fmt.Errorf("ReadString: %v", err)
-					}
 	}

-				switch {
-				case strings.HasPrefix(line, "copy from "):
-					curFile.IsRenamed = true
-					curFile.Type = DiffFileCopy
-				case strings.HasPrefix(line, "copy to "):
-					curFile.IsRenamed = true
-					curFile.Type = DiffFileCopy
-				case strings.HasPrefix(line, "new file"):
-					curFile.Type = DiffFileAdd
-					curFile.IsCreated = true
-				case strings.HasPrefix(line, "deleted"):
-					curFile.Type = DiffFileDel
-					curFile.IsDeleted = true
-				case strings.HasPrefix(line, "index"):
+	rd := strings.NewReader(line[len(cmdDiffHead):] + " ")
 	curFile.Type = DiffFileChange
-				case strings.HasPrefix(line, "similarity index 100%"):
-					curFile.Type = DiffFileRename
-				}
-				if curFile.Type > 0 {
-					if strings.HasSuffix(line, " 160000\n") {
-						curFile.IsSubmodule = true
-					}
-					break
-				}
-			}
-		}
+	curFile.OldName = readFileName(rd)
+	curFile.Name = readFileName(rd)
+	curFile.IsRenamed = curFile.Name != curFile.OldName
+	return curFile
 }

-	// FIXME: detect encoding while parsing.
-	var buf bytes.Buffer
-	for _, f := range diff.Files {
-		buf.Reset()
-		for _, sec := range f.Sections {
-			for _, l := range sec.Lines {
-				buf.WriteString(l.Content)
-				buf.WriteString("\n")
+func readFileName(rd *strings.Reader) string {
+	var name string
+	char, _ := rd.ReadByte()
+	_ = rd.UnreadByte()
+	if char == '"' {
+		fmt.Fscanf(rd, "%q ", &name)
+		if name[0] == '\\' {
+			name = name[1:]
 		}
+	} else {
+		fmt.Fscanf(rd, "%s ", &name)
 	}
-		charsetLabel, err := charset.DetectEncoding(buf.Bytes())
-		if charsetLabel != "UTF-8" && err == nil {
-			encoding, _ := stdcharset.Lookup(charsetLabel)
-			if encoding != nil {
-				d := encoding.NewDecoder()
-				for _, sec := range f.Sections {
-					for _, l := range sec.Lines {
-						if c, _, err := transform.String(d, l.Content); err == nil {
-							l.Content = c
-						}
-					}
-				}
-			}
-		}
-	}
-
-	return diff, nil
+	return name[2:]
 }

 // GetDiffRange builds a Diff between two commits of a repository.
--- a/services/gitdiff/gitdiff_test.go
+++ b/services/gitdiff/gitdiff_test.go
@ -148,6 +148,27 @@ rename to a b/a a/file b/b file
 			oldFilename: "a b/file b/a a/file",
 			filename:    "a b/a a/file b/b file",
 		},
+		{
+			name: "minuses-and-pluses",
+			gitdiff: `diff --git a/minuses-and-pluses b/minuses-and-pluses
+index 6961180..9ba1a00 100644
+--- a/minuses-and-pluses
+++ b/minuses-and-pluses
+@@ -1,4 +1,4 @@
+--- 1st line
+-++ 2nd line
+--- 3rd line
+-++ 4th line
+++ 1st line
+-- 2nd line
+++ 3rd line
+-- 4th line
+`,
+			oldFilename: "minuses-and-pluses",
+			filename:    "minuses-and-pluses",
+			addition:    4,
+			deletion:    4,
+		},
 	}

 	for _, testcase := range tests {