Strip invalid XML characters to avoid parsing errors

2018-02-07 20:57:56 -08:00 · 2018-02-07 20:57:56 -08:00 · 7b0bfd9308
parent c6fd9eb9b1
commit 7b0bfd9308
1 changed files with 26 additions and 1 deletions
--- a/reader/feed/parser.go
+++ b/reader/feed/parser.go
@@ -12,6 +12,7 @@ import (
 	"strings"
 	"time"

+	"github.com/miniflux/miniflux/logger"
 	"github.com/miniflux/miniflux/model"
 	"github.com/miniflux/miniflux/reader/atom"
 	"github.com/miniflux/miniflux/reader/encoding"
@@ -74,7 +75,8 @@ func parseFeed(r io.Reader) (*model.Feed, error) {
 		return nil, errors.New("This feed is empty")
 	}

-	reader := bytes.NewReader(buffer.Bytes())
+	str := stripInvalidXMLCharacters(buffer.String())
+	reader := strings.NewReader(str)
 	format := DetectFeedFormat(reader)
 	reader.Seek(0, io.SeekStart)

@@ -91,3 +93,26 @@ func parseFeed(r io.Reader) (*model.Feed, error) {
 		return nil, errors.New("Unsupported feed format")
 	}
 }
+
+// stripInvalidXMLCharacters drops every rune rejected by isInCharacterRange.
+func stripInvalidXMLCharacters(input string) string {
+	return strings.Map(func(char rune) rune {
+		if !isInCharacterRange(char) {
+			logger.Debug("Strip invalid XML characters: %U", char)
+			return -1 // a negative result tells strings.Map to drop the rune
+		}
+		return char
+	}, input)
+}
+
+// isInCharacterRange reports whether r is a legal XML 1.0 character, i.e.
+// matches the Char production of http://www.xml.com/axml/testaxml.htm,
+// Section 2.2 Characters. Surrogates (0xD800-0xDFFF) are not legal.
+func isInCharacterRange(r rune) (inrange bool) {
+	return r == 0x09 ||
+		r == 0x0A ||
+		r == 0x0D ||
+		r >= 0x20 && r <= 0xD7FF ||
+		r >= 0xE000 && r <= 0xFFFD ||
+		r >= 0x10000 && r <= 0x10FFFF
+}