mirror of
https://github.com/go-gitea/gitea
synced 2025-07-22 10:18:38 +00:00
Switch Unicode Escaping to a VSCode-like system (#19990)
This PR rewrites the invisible unicode detection algorithm to more closely match that of the Monaco editor on the system. It provides a technique for detecting ambiguous characters and relaxes the detection of combining marks. In addition, control characters are detected as invisible in this implementation, whereas they are not in Monaco; this difference is related to font issues. Close #19913 Signed-off-by: Andrew Thornton <art27@cantab.net>
This commit is contained in:
@@ -1,236 +1,58 @@
|
||||
// Copyright 2021 The Gitea Authors. All rights reserved.
|
||||
// Copyright 2022 The Gitea Authors. All rights reserved.
|
||||
// Use of this source code is governed by a MIT-style
|
||||
// license that can be found in the LICENSE file.
|
||||
|
||||
//go:generate go run invisible/generate.go -v -o ./invisible_gen.go
|
||||
|
||||
//go:generate go run ambiguous/generate.go -v -o ./ambiguous_gen.go ambiguous/ambiguous.json
|
||||
|
||||
package charset
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"fmt"
|
||||
"io"
|
||||
"strings"
|
||||
"unicode"
|
||||
"unicode/utf8"
|
||||
|
||||
"golang.org/x/text/unicode/bidi"
|
||||
"code.gitea.io/gitea/modules/log"
|
||||
"code.gitea.io/gitea/modules/translation"
|
||||
)
|
||||
|
||||
// EscapeStatus represents the findings of the unicode escaper
|
||||
type EscapeStatus struct {
|
||||
Escaped bool
|
||||
HasError bool
|
||||
HasBadRunes bool
|
||||
HasControls bool
|
||||
HasSpaces bool
|
||||
HasMarks bool
|
||||
HasBIDI bool
|
||||
BadBIDI bool
|
||||
HasRTLScript bool
|
||||
HasLTRScript bool
|
||||
// RuneNBSP is the codepoint for NBSP
|
||||
const RuneNBSP = 0xa0
|
||||
|
||||
// EscapeControlHTML escapes the unicode control sequences in a provided html document
|
||||
func EscapeControlHTML(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
|
||||
sb := &strings.Builder{}
|
||||
outputStream := &HTMLStreamerWriter{Writer: sb}
|
||||
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
|
||||
|
||||
if err := StreamHTML(strings.NewReader(text), streamer); err != nil {
|
||||
streamer.escaped.HasError = true
|
||||
log.Error("Error whilst escaping: %v", err)
|
||||
}
|
||||
return streamer.escaped, sb.String()
|
||||
}
|
||||
|
||||
// Or combines two EscapeStatus structs into one representing the disjunction (logical OR) of the two
|
||||
func (status EscapeStatus) Or(other EscapeStatus) EscapeStatus {
|
||||
st := status
|
||||
st.Escaped = st.Escaped || other.Escaped
|
||||
st.HasError = st.HasError || other.HasError
|
||||
st.HasBadRunes = st.HasBadRunes || other.HasBadRunes
|
||||
st.HasControls = st.HasControls || other.HasControls
|
||||
st.HasSpaces = st.HasSpaces || other.HasSpaces
|
||||
st.HasMarks = st.HasMarks || other.HasMarks
|
||||
st.HasBIDI = st.HasBIDI || other.HasBIDI
|
||||
st.BadBIDI = st.BadBIDI || other.BadBIDI
|
||||
st.HasRTLScript = st.HasRTLScript || other.HasRTLScript
|
||||
st.HasLTRScript = st.HasLTRScript || other.HasLTRScript
|
||||
return st
|
||||
// EscapeControlReaders escapes the unicode control sequences in a provider reader and writer in a locale and returns the findings as an EscapeStatus and the escaped []byte
|
||||
func EscapeControlReader(reader io.Reader, writer io.Writer, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, err error) {
|
||||
outputStream := &HTMLStreamerWriter{Writer: writer}
|
||||
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
|
||||
|
||||
if err = StreamHTML(reader, streamer); err != nil {
|
||||
streamer.escaped.HasError = true
|
||||
log.Error("Error whilst escaping: %v", err)
|
||||
}
|
||||
return streamer.escaped, err
|
||||
}
|
||||
|
||||
// EscapeControlString escapes the unicode control sequences in a provided string and returns the findings as an EscapeStatus and the escaped string
|
||||
func EscapeControlString(text string) (EscapeStatus, string) {
|
||||
func EscapeControlString(text string, locale translation.Locale, allowed ...rune) (escaped *EscapeStatus, output string) {
|
||||
sb := &strings.Builder{}
|
||||
escaped, _ := EscapeControlReader(strings.NewReader(text), sb)
|
||||
return escaped, sb.String()
|
||||
}
|
||||
outputStream := &HTMLStreamerWriter{Writer: sb}
|
||||
streamer := NewEscapeStreamer(locale, outputStream, allowed...).(*escapeStreamer)
|
||||
|
||||
// EscapeControlBytes escapes the unicode control sequences a provided []byte and returns the findings as an EscapeStatus and the escaped []byte
|
||||
func EscapeControlBytes(text []byte) (EscapeStatus, []byte) {
|
||||
buf := &bytes.Buffer{}
|
||||
escaped, _ := EscapeControlReader(bytes.NewReader(text), buf)
|
||||
return escaped, buf.Bytes()
|
||||
}
|
||||
|
||||
// EscapeControlReader escapes the unicode control sequences in a provided Reader, writing the escaped output to the output, and returns the findings as an EscapeStatus and an error
|
||||
func EscapeControlReader(text io.Reader, output io.Writer) (escaped EscapeStatus, err error) {
|
||||
buf := make([]byte, 4096)
|
||||
readStart := 0
|
||||
runeCount := 0
|
||||
var n int
|
||||
var writePos int
|
||||
|
||||
lineHasBIDI := false
|
||||
lineHasRTLScript := false
|
||||
lineHasLTRScript := false
|
||||
|
||||
readingloop:
|
||||
for err == nil {
|
||||
n, err = text.Read(buf[readStart:])
|
||||
bs := buf[:n+readStart]
|
||||
n = len(bs)
|
||||
i := 0
|
||||
|
||||
for i < len(bs) {
|
||||
r, size := utf8.DecodeRune(bs[i:])
|
||||
runeCount++
|
||||
|
||||
// Now handle the codepoints
|
||||
switch {
|
||||
case r == utf8.RuneError:
|
||||
if writePos < i {
|
||||
if _, err = output.Write(bs[writePos:i]); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
writePos = i
|
||||
}
|
||||
// runes can be at most 4 bytes - so...
|
||||
if len(bs)-i <= 3 {
|
||||
// if not request more data
|
||||
copy(buf, bs[i:])
|
||||
readStart = n - i
|
||||
writePos = 0
|
||||
continue readingloop
|
||||
}
|
||||
// this is a real broken rune
|
||||
escaped.HasBadRunes = true
|
||||
escaped.Escaped = true
|
||||
if err = writeBroken(output, bs[i:i+size]); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
writePos += size
|
||||
case r == '\n':
|
||||
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
|
||||
escaped.BadBIDI = true
|
||||
}
|
||||
lineHasBIDI = false
|
||||
lineHasRTLScript = false
|
||||
lineHasLTRScript = false
|
||||
|
||||
case runeCount == 1 && r == 0xFEFF: // UTF BOM
|
||||
// the first BOM is safe
|
||||
case r == '\r' || r == '\t' || r == ' ':
|
||||
// These are acceptable control characters and space characters
|
||||
case unicode.IsSpace(r):
|
||||
escaped.HasSpaces = true
|
||||
escaped.Escaped = true
|
||||
if writePos < i {
|
||||
if _, err = output.Write(bs[writePos:i]); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
}
|
||||
if err = writeEscaped(output, r); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
writePos = i + size
|
||||
case unicode.Is(unicode.Bidi_Control, r):
|
||||
escaped.Escaped = true
|
||||
escaped.HasBIDI = true
|
||||
if writePos < i {
|
||||
if _, err = output.Write(bs[writePos:i]); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
}
|
||||
lineHasBIDI = true
|
||||
if err = writeEscaped(output, r); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
writePos = i + size
|
||||
case unicode.Is(unicode.C, r):
|
||||
escaped.Escaped = true
|
||||
escaped.HasControls = true
|
||||
if writePos < i {
|
||||
if _, err = output.Write(bs[writePos:i]); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
}
|
||||
if err = writeEscaped(output, r); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
writePos = i + size
|
||||
case unicode.Is(unicode.M, r):
|
||||
escaped.Escaped = true
|
||||
escaped.HasMarks = true
|
||||
if writePos < i {
|
||||
if _, err = output.Write(bs[writePos:i]); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
}
|
||||
if err = writeEscaped(output, r); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
writePos = i + size
|
||||
default:
|
||||
p, _ := bidi.Lookup(bs[i : i+size])
|
||||
c := p.Class()
|
||||
if c == bidi.R || c == bidi.AL {
|
||||
lineHasRTLScript = true
|
||||
escaped.HasRTLScript = true
|
||||
} else if c == bidi.L {
|
||||
lineHasLTRScript = true
|
||||
escaped.HasLTRScript = true
|
||||
}
|
||||
}
|
||||
i += size
|
||||
}
|
||||
if n > 0 {
|
||||
// we read something...
|
||||
// write everything unwritten
|
||||
if writePos < i {
|
||||
if _, err = output.Write(bs[writePos:i]); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
// reset the starting positions for the next read
|
||||
readStart = 0
|
||||
writePos = 0
|
||||
}
|
||||
if err := streamer.Text(text); err != nil {
|
||||
streamer.escaped.HasError = true
|
||||
log.Error("Error whilst escaping: %v", err)
|
||||
}
|
||||
if readStart > 0 {
|
||||
// this means that there is an incomplete or broken rune at 0-readStart and we read nothing on the last go round
|
||||
escaped.Escaped = true
|
||||
escaped.HasBadRunes = true
|
||||
if err = writeBroken(output, buf[:readStart]); err != nil {
|
||||
escaped.HasError = true
|
||||
return
|
||||
}
|
||||
}
|
||||
if err == io.EOF {
|
||||
if lineHasBIDI && !lineHasRTLScript && lineHasLTRScript {
|
||||
escaped.BadBIDI = true
|
||||
}
|
||||
err = nil
|
||||
return
|
||||
}
|
||||
escaped.HasError = true
|
||||
return escaped, err
|
||||
}
|
||||
|
||||
// writeBroken writes an HTML marker for a broken (invalid UTF-8) byte sequence
// to output. The bytes are rendered as upper-case hexadecimal wrapped in a
// span with class "broken-code-point".
//
// The surrounding angle brackets are emitted as the character references
// &lt; and &gt; so that they are displayed literally; raw '<' and '>' would be
// parsed by a browser as an unknown tag and the hex value would not be shown.
//
// It returns any error reported by the underlying writer.
func writeBroken(output io.Writer, bs []byte) (err error) {
	_, err = fmt.Fprintf(output, `<span class="broken-code-point">&lt;%X&gt;</span>`, bs)
	return err
}
|
||||
|
||||
func writeEscaped(output io.Writer, r rune) (err error) {
|
||||
_, err = fmt.Fprintf(output, `<span class="escaped-code-point" data-escaped="[U+%04X]"><span class="char">%c</span></span>`, r, r)
|
||||
return err
|
||||
return streamer.escaped, sb.String()
|
||||
}
|
||||
|
Reference in New Issue
Block a user