1
1
mirror of https://github.com/go-gitea/gitea synced 2025-07-22 10:18:38 +00:00

Switch Unicode Escaping to a VSCode-like system (#19990)

This PR rewrites the invisible unicode detection algorithm to more
closely match that of the Monaco editor on the system. It provides a
technique for detecting ambiguous characters and relaxes the detection
of combining marks.

In addition, control characters are detected as invisible in this
implementation, whereas they are not in Monaco; this difference is
related to font issues.

Close #19913

Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
zeripath
2022-08-13 19:32:34 +01:00
committed by GitHub
parent 11dc6df5be
commit 99efa02edf
29 changed files with 2107 additions and 371 deletions

View File

@@ -1,236 +1,58 @@
// Copyright 2021 The Gitea Authors. All rights reserved.
// Copyright 2022 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
//go:generate go run invisible/generate.go -v -o ./invisible_gen.go
//go:generate go run ambiguous/generate.go -v -o ./ambiguous_gen.go ambiguous/ambiguous.json
package charset
import (
"bytes"
"fmt"
"io"
"strings"
"unicode"
"unicode/utf8"
"golang.org/x/text/unicode/bidi"
"code.gitea.io/gitea/modules/log"
"code.gitea.io/gitea/modules/translation"
)
// EscapeStatus represents the findings of the unicode escaper.
// Each field is a flag; the descriptions below reflect how the reader-based
// escaper in this file sets them.
type EscapeStatus struct {
// Escaped is set whenever at least one code point was rewritten in the output.
Escaped bool
// HasError is set when reading, writing or streaming failed.
HasError bool
// HasBadRunes is set when bytes that do not decode as UTF-8 were found.
HasBadRunes bool
// HasControls is set when a rune in unicode.C was escaped.
HasControls bool
// HasSpaces is set when a unicode.IsSpace rune other than '\n', '\r', '\t' or ' ' was escaped.
HasSpaces bool
// HasMarks is set when a rune in unicode.M was escaped.
HasMarks bool
// HasBIDI is set when a rune in unicode.Bidi_Control was escaped.
HasBIDI bool
// BadBIDI is set when a line contains a BIDI control and left-to-right script but no right-to-left script.
BadBIDI bool
// HasRTLScript is set when a rune of bidi class R or AL was seen.
HasRTLScript bool
// HasLTRScript is set when a rune of bidi class L was seen.
HasLTRScript bool
// RuneNBSP is the code point of the no-break space character (U+00A0).
const RuneNBSP = 0xa0
// EscapeControlHTML passes the HTML document in text through StreamHTML using
// an escape streamer built for locale, and returns the streamer's EscapeStatus
// together with everything the streamer wrote.
//
// allowed is forwarded unchanged to NewEscapeStreamer (presumably the runes
// that should not be escaped - confirm against NewEscapeStreamer).
//
// A streaming failure is not returned: it is logged and recorded by setting
// HasError on the returned status.
func EscapeControlHTML(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
	var out strings.Builder
	streamer := NewEscapeStreamer(locale, &HTMLStreamerWriter{Writer: &out}, allowed...).(*escapeStreamer)

	err := StreamHTML(strings.NewReader(text), streamer)
	if err != nil {
		streamer.escaped.HasError = true
		log.Error("Error whilst escaping: %v", err)
	}

	return streamer.escaped, out.String()
}
// Or combines two EscapeStatus structs into one representing the disjunction of the two:
// every flag in the result is true when it is true in either operand.
// Both operands are taken by value, so neither is modified.
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
// work on a copy of the receiver and fold the other status into it field by field
st := status
st.Escaped = st.Escaped || other.Escaped
st.HasError = st.HasError || other.HasError
st.HasBadRunes = st.HasBadRunes || other.HasBadRunes
st.HasControls = st.HasControls || other.HasControls
st.HasSpaces = st.HasSpaces || other.HasSpaces
st.HasMarks = st.HasMarks || other.HasMarks
st.HasBIDI = st.HasBIDI || other.HasBIDI
st.BadBIDI = st.BadBIDI || other.BadBIDI
st.HasRTLScript = st.HasRTLScript || other.HasRTLScript
st.HasLTRScript = st.HasLTRScript || other.HasLTRScript
return st
// EscapeControlReader passes reader through StreamHTML using an escape
// streamer built for locale whose output is written to writer.
//
// allowed is forwarded unchanged to NewEscapeStreamer (presumably the runes
// that should not be escaped - confirm against NewEscapeStreamer).
//
// It returns the streamer's EscapeStatus and the error reported by StreamHTML,
// if any. On error the failure is also logged and HasError is set on the
// returned status.
func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) {
	streamer := NewEscapeStreamer(locale, &HTMLStreamerWriter{Writer: writer}, allowed...).(*escapeStreamer)

	err = StreamHTML(reader, streamer)
	if err != nil {
		streamer.escaped.HasError = true
		log.Error("Error whilst escaping: %v", err)
	}

	return streamer.escaped, err
}
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string.
// NOTE(review): two signatures for EscapeControlString follow back to back. The second
// (text, locale, allowed...) has the same parameter shape as EscapeControlHTML in this file,
// and the statements after the first closing brace use its locale/allowed parameters.
// This looks like two revisions interleaved - confirm which one is current.
func EscapeControlString(text string) (EscapeStatus, string) {
func EscapeControlString(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
sb := &strings.Builder{}
// the error from EscapeControlReader is discarded here
escaped, _ := EscapeControlReader(strings.NewReader(text), sb)
return escaped, sb.String()
}
outputStream := &HTMLStreamerWriter{Writer: sb}
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
// EscapeControlBytes escapes the unicode control sequences in the provided
// []byte and returns the findings as an EscapeStatus together with the
// escaped bytes.
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
	var out bytes.Buffer
	// Input and output are both in memory; the error result of
	// EscapeControlReader is discarded and only the status is reported.
	status, _ := EscapeControlReader(bytes.NewReader(text), &out)
	return status, out.Bytes()
}
// EscapeControlReader escapes the unicode control sequences in a provided Reader, writing the escaped output to the output, and returns the findings as an EscapeStatus and an error.
//
// The input is consumed in chunks through a 4096-byte buffer and decoded rune by rune.
// Runs of untouched bytes are copied to output verbatim; bytes that do not decode are
// written with writeBroken, and non-plain spaces, BIDI controls, other runes in unicode.C
// and runes in unicode.M are written with writeEscaped. io.EOF from the reader is treated
// as normal termination and reported as a nil error.
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
buf := make([]byte, 4096)
// readStart is the number of bytes carried over at the front of buf from the previous read
readStart := 0
// runeCount is only consulted to recognise a BOM as the very first rune
runeCount := 0
var n int
// writePos is the offset in the current chunk up to which bytes have already been emitted
var writePos int
// per-line state, reset at every '\n'
lineHasBIDI := false
lineHasRTLScript := false
lineHasLTRScript := false
readingloop:
for err == nil {
n, err = text.Read(buf[readStart:])
// bs covers any carried-over bytes plus what was just read
bs := buf[:n+readStart]
n = len(bs)
i := 0
for i < len(bs) {
r, size := utf8.DecodeRune(bs[i:])
runeCount++
// Now handle the codepoints
switch {
case r == utf8.RuneError:
// NOTE(review): utf8.DecodeRune also yields RuneError (with size 3) for a validly
// encoded U+FFFD; this case does not look at size, so such input takes this path too - confirm intended.
// flush the clean bytes that precede the undecodable sequence
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
writePos = i
}
// runes can be at most 4 bytes - so...
if len(bs)-i <= 3 {
// if not request more data
// move the tail to the front of buf so the next Read appends to it
copy(buf, bs[i:])
readStart = n - i
writePos = 0
continue readingloop
}
// this is a real broken rune
escaped.HasBadRunes = true
escaped.Escaped = true
if err = writeBroken(output, bs[i:i+size]); err != nil {
escaped.HasError = true
return
}
writePos += size
case r == '\n':
// end of line: a line with BIDI controls and left-to-right script but no right-to-left script is flagged
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
escaped.BadBIDI = true
}
lineHasBIDI = false
lineHasRTLScript = false
lineHasLTRScript = false
case runeCount == 1 && r == 0xFEFF: // UTF BOM
// the first BOM is safe
case r == '\r' || r == '\t' || r == ' ':
// These are acceptable control characters and space characters
case unicode.IsSpace(r):
// any other space character is escaped
escaped.HasSpaces = true
escaped.Escaped = true
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}
if err = writeEscaped(output, r); err != nil {
escaped.HasError = true
return
}
writePos = i + size
case unicode.Is(unicode.Bidi_Control, r):
// bidirectional control characters are escaped and remembered for the current line
escaped.Escaped = true
escaped.HasBIDI = true
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}
lineHasBIDI = true
if err = writeEscaped(output, r); err != nil {
escaped.HasError = true
return
}
writePos = i + size
case unicode.Is(unicode.C, r):
// remaining runes in category C (control, format, ...) are escaped
escaped.Escaped = true
escaped.HasControls = true
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}
if err = writeEscaped(output, r); err != nil {
escaped.HasError = true
return
}
writePos = i + size
case unicode.Is(unicode.M, r):
// runes in category M (marks) are escaped
escaped.Escaped = true
escaped.HasMarks = true
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}
if err = writeEscaped(output, r); err != nil {
escaped.HasError = true
return
}
writePos = i + size
default:
// everything else is left in place; only its bidi class is recorded
p, _ := bidi.Lookup(bs[i : i+size])
c := p.Class()
if c == bidi.R || c == bidi.AL {
lineHasRTLScript = true
escaped.HasRTLScript = true
} else if c == bidi.L {
lineHasLTRScript = true
escaped.HasLTRScript = true
}
}
i += size
}
if n > 0 {
// we read something...
// write everything unwritten
if writePos < i {
if _, err = output.Write(bs[writePos:i]); err != nil {
escaped.HasError = true
return
}
}
// reset the starting positions for the next read
readStart = 0
writePos = 0
}
// NOTE(review): the next three lines use streamer, which is never declared in this function,
// and pass text (an io.Reader here) to streamer.Text - they appear to belong to a different
// function (likely the newer EscapeControlString); confirm.
if err := streamer.Text(text); err != nil {
streamer.escaped.HasError = true
log.Error("Error whilst escaping: %v", err)
}
if readStart > 0 {
// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round
escaped.Escaped = true
escaped.HasBadRunes = true
if err = writeBroken(output, buf[:readStart]); err != nil {
escaped.HasError = true
return
}
}
if err == io.EOF {
// the input may end without a trailing '\n', so apply the per-line BIDI check to the last line too
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
escaped.BadBIDI = true
}
// reaching the end of the input is not an error for the caller
err = nil
return
}
// any other read error is reported both in the status and as the returned error
escaped.HasError = true
return escaped, err
}
// writeBroken writes bs to output as upper-case hexadecimal, enclosed in
// HTML-escaped angle brackets and wrapped in a "broken-code-point" span.
// It returns the error from the underlying write, if any.
func writeBroken(output io.Writer, bs []byte) (err error) {
	const brokenSpan = `<span class="broken-code-point">&lt;%X&gt;</span>`
	if _, err = fmt.Fprintf(output, brokenSpan, bs); err != nil {
		return err
	}
	return nil
}
// writeEscaped writes r to output wrapped in an "escaped-code-point" span.
// The span's data-escaped attribute carries the code point as [U+XXXX]
// (upper-case hex, zero-padded to at least four digits) and the character
// itself is placed in an inner "char" span.
// It returns the error from the underlying write, if any.
func writeEscaped(output io.Writer, r rune) (err error) {
_, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r)
return err
// NOTE(review): the return below references streamer and sb, neither of which is declared in
// this function - it appears to belong to a different function (likely EscapeControlString); confirm.
return streamer.escaped, sb.String()
}