mirror of
				https://github.com/go-gitea/gitea
				synced 2025-11-03 21:08:25 +00:00 
			
		
		
		
	
		
			
				
	
	
		
			180 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
			
		
		
	
	
			180 lines
		
	
	
		
			3.5 KiB
		
	
	
	
		
			Go
		
	
	
	
	
	
package mahonia
 | 
						|
 | 
						|
// decoding HTML entities
 | 
						|
 | 
						|
import (
 | 
						|
	"sort"
 | 
						|
)
 | 
						|
 | 
						|
// EntityDecoder returns a Decoder that decodes HTML character entities.
 | 
						|
// If there is no valid character entity at the current position, it returns INVALID_CHAR.
 | 
						|
// So it needs to be combined with another Decoder via FallbackDecoder.
 | 
						|
func EntityDecoder() Decoder {
 | 
						|
	var leftover rune // leftover rune from two-rune entity
 | 
						|
	return func(p []byte) (r rune, size int, status Status) {
 | 
						|
		if leftover != 0 {
 | 
						|
			r = leftover
 | 
						|
			leftover = 0
 | 
						|
			return r, 0, SUCCESS
 | 
						|
		}
 | 
						|
 | 
						|
		if len(p) == 0 {
 | 
						|
			return 0, 0, NO_ROOM
 | 
						|
		}
 | 
						|
 | 
						|
		if p[0] != '&' {
 | 
						|
			return 0xfffd, 1, INVALID_CHAR
 | 
						|
		}
 | 
						|
 | 
						|
		if len(p) < 3 {
 | 
						|
			return 0, 1, NO_ROOM
 | 
						|
		}
 | 
						|
 | 
						|
		r, size, status = 0xfffd, 1, INVALID_CHAR
 | 
						|
		n := 1 // number of bytes read so far
 | 
						|
 | 
						|
		if p[n] == '#' {
 | 
						|
			n++
 | 
						|
			c := p[n]
 | 
						|
			hex := false
 | 
						|
			if c == 'x' || c == 'X' {
 | 
						|
				hex = true
 | 
						|
				n++
 | 
						|
			}
 | 
						|
 | 
						|
			var x rune
 | 
						|
			for n < len(p) {
 | 
						|
				c = p[n]
 | 
						|
				n++
 | 
						|
				if hex {
 | 
						|
					if '0' <= c && c <= '9' {
 | 
						|
						x = 16*x + rune(c) - '0'
 | 
						|
						continue
 | 
						|
					} else if 'a' <= c && c <= 'f' {
 | 
						|
						x = 16*x + rune(c) - 'a' + 10
 | 
						|
						continue
 | 
						|
					} else if 'A' <= c && c <= 'F' {
 | 
						|
						x = 16*x + rune(c) - 'A' + 10
 | 
						|
						continue
 | 
						|
					}
 | 
						|
				} else if '0' <= c && c <= '9' {
 | 
						|
					x = 10*x + rune(c) - '0'
 | 
						|
					continue
 | 
						|
				}
 | 
						|
				if c != ';' {
 | 
						|
					n--
 | 
						|
				}
 | 
						|
				break
 | 
						|
			}
 | 
						|
 | 
						|
			if n == len(p) && p[n-1] != ';' {
 | 
						|
				return 0, 0, NO_ROOM
 | 
						|
			}
 | 
						|
 | 
						|
			size = n
 | 
						|
			if p[n-1] == ';' {
 | 
						|
				n--
 | 
						|
			}
 | 
						|
			if hex {
 | 
						|
				n--
 | 
						|
			}
 | 
						|
			n--
 | 
						|
			// Now n is the number of actual digits read.
 | 
						|
			if n == 0 {
 | 
						|
				return 0xfffd, 1, INVALID_CHAR
 | 
						|
			}
 | 
						|
 | 
						|
			if 0x80 <= x && x <= 0x9F {
 | 
						|
				// Replace characters from Windows-1252 with UTF-8 equivalents.
 | 
						|
				x = replacementTable[x-0x80]
 | 
						|
			} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
 | 
						|
				// Replace invalid characters with the replacement character.
 | 
						|
				return 0xfffd, size, INVALID_CHAR
 | 
						|
			}
 | 
						|
 | 
						|
			r = x
 | 
						|
			status = SUCCESS
 | 
						|
			return
 | 
						|
		}
 | 
						|
 | 
						|
		// Look for a named entity in EntityList.
 | 
						|
 | 
						|
		possible := entityList
 | 
						|
		for len(possible) > 0 {
 | 
						|
			if len(p) <= n {
 | 
						|
				leftover = 0
 | 
						|
				return 0, 0, NO_ROOM
 | 
						|
			}
 | 
						|
 | 
						|
			c := p[n]
 | 
						|
 | 
						|
			// Narrow down the selection in possible to those items that have c in the
 | 
						|
			// appropriate byte.
 | 
						|
			first := sort.Search(len(possible), func(i int) bool {
 | 
						|
				e := possible[i].name
 | 
						|
				if len(e) < n {
 | 
						|
					return false
 | 
						|
				}
 | 
						|
				return e[n-1] >= c
 | 
						|
			})
 | 
						|
			possible = possible[first:]
 | 
						|
			last := sort.Search(len(possible), func(i int) bool {
 | 
						|
				return possible[i].name[n-1] > c
 | 
						|
			})
 | 
						|
			possible = possible[:last]
 | 
						|
 | 
						|
			n++
 | 
						|
			if len(possible) > 0 && len(possible[0].name) == n-1 {
 | 
						|
				r, leftover = possible[0].r1, possible[0].r2
 | 
						|
				size = n
 | 
						|
				status = SUCCESS
 | 
						|
				// but don't return yet, since we need the longest match
 | 
						|
			}
 | 
						|
		}
 | 
						|
 | 
						|
		return
 | 
						|
	}
 | 
						|
}
 | 
						|
 | 
						|
// replacementTable maps the numeric character references 0x80 through 0x9F
// to the runes that should be produced for them. It is indexed by the
// referenced value minus 0x80.
//
// The table is copied from /src/pkg/html/escape.go in the Go source. The
// replacements permit compatibility with old numeric entities that assumed
// Windows-1252 encoding; see
// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
//
// Two related cases are not part of the table: 0x00->'\uFFFD' is handled
// programmatically, and 0x0D->'\u000D' is a no-op.
var replacementTable = [...]rune{
	0x80 - 0x80: '\u20AC', // euro sign
	0x81 - 0x80: '\u0081', // unchanged
	0x82 - 0x80: '\u201A', // single low-9 quotation mark
	0x83 - 0x80: '\u0192', // latin small letter f with hook
	0x84 - 0x80: '\u201E', // double low-9 quotation mark
	0x85 - 0x80: '\u2026', // horizontal ellipsis
	0x86 - 0x80: '\u2020', // dagger
	0x87 - 0x80: '\u2021', // double dagger
	0x88 - 0x80: '\u02C6', // modifier letter circumflex accent
	0x89 - 0x80: '\u2030', // per mille sign
	0x8A - 0x80: '\u0160', // latin capital letter S with caron
	0x8B - 0x80: '\u2039', // single left-pointing angle quotation mark
	0x8C - 0x80: '\u0152', // latin capital ligature OE
	0x8D - 0x80: '\u008D', // unchanged
	0x8E - 0x80: '\u017D', // latin capital letter Z with caron
	0x8F - 0x80: '\u008F', // unchanged
	0x90 - 0x80: '\u0090', // unchanged
	0x91 - 0x80: '\u2018', // left single quotation mark
	0x92 - 0x80: '\u2019', // right single quotation mark
	0x93 - 0x80: '\u201C', // left double quotation mark
	0x94 - 0x80: '\u201D', // right double quotation mark
	0x95 - 0x80: '\u2022', // bullet
	0x96 - 0x80: '\u2013', // en dash
	0x97 - 0x80: '\u2014', // em dash
	0x98 - 0x80: '\u02DC', // small tilde
	0x99 - 0x80: '\u2122', // trade mark sign
	0x9A - 0x80: '\u0161', // latin small letter s with caron
	0x9B - 0x80: '\u203A', // single right-pointing angle quotation mark
	0x9C - 0x80: '\u0153', // latin small ligature oe
	0x9D - 0x80: '\u009D', // unchanged
	0x9E - 0x80: '\u017E', // latin small letter z with caron
	0x9F - 0x80: '\u0178', // latin capital letter Y with diaeresis
}
 |