123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167 |
- package stream
- import (
- "bytes"
- "encoding/xml"
- "fmt"
- "io"
- "unicode"
- "golang.org/x/net/html/charset"
- )
// frame is the bookkeeping record kept for one open XML element while the
// token stream is being converted. One frame is pushed per start tag and
// popped again on the matching end tag.
type frame struct {
	// HasNested is set when the token immediately following the element's
	// start tag is another start tag, i.e. the element opens with a child.
	HasNested bool

	// TotalNested counts the child start tags seen so far directly inside
	// this element.
	TotalNested int

	// AttrCount is the number of attributes on the element's start tag.
	AttrCount int
}
- func run(xmlDecoder *xml.Decoder, reader io.Reader, writer io.Writer) {
- writeBuffer := bytes.NewBufferString("")
- var tokenToProcess interface{}
- stack := []frame{}
- stackSize := 0
- var prevToken interface{}
- for {
- tokenI := tokenToProcess
- tokenToProcess = nil
- if tokenI == nil {
- tokenI, _ = xmlDecoder.Token()
- }
- if tokenI == nil {
- break
- }
- switch token := tokenI.(type) {
- case xml.StartElement:
- switch prevToken.(type) {
- case xml.EndElement:
- writeBuffer.WriteString(", ")
- }
- parent := frame{}
- frame := frame{}
- if stackSize > 0 {
- parent = stack[stackSize-1]
- }
- tokenToProcess,_ = xmlDecoder.Token()
- switch tokenToProcess.(type) {
- case xml.StartElement:
- if prevToken != nil {
- writeBuffer.WriteString(fmt.Sprintf("\"%v\":", parent.TotalNested))
- }
- writeBuffer.WriteString("{")
- frame.HasNested = true
- }
- writeBuffer.WriteString("\""+token.Name.Local+"\":")
- frame.AttrCount = len(token.Attr)
- if frame.AttrCount > 0 {
- writeBuffer.WriteString("{")
- }
- for _, attr := range token.Attr {
- writeBuffer.WriteString("\"@"+attr.Name.Local+"\": \""+attr.Value+"\",")
- }
- if frame.AttrCount > 0 {
- writeBuffer.WriteString("\"value\":")
- }
- if frame.HasNested {
- writeBuffer.WriteString("{")
- }
- if stackSize > 0 {
- parent.TotalNested++
- stack[stackSize-1] = parent
- }
- stack = append(stack[:stackSize], frame)
- stackSize++
- case xml.CharData:
- switch prevToken.(type) {
- case xml.CharData:
- writeBuffer.WriteString(", ")
- }
- writeBuffer.WriteString("\""+trimNonGraphic(string(xml.CharData(token)))+"\"")
- case xml.EndElement:
- var frame frame
- if stackSize > 0 {
- stackSize--
- frame = stack[stackSize]
- } else {
- // TODO: process this error (unbalanced StartElements and EndElements)
- }
- if frame.HasNested {
- writeBuffer.WriteString(fmt.Sprintf(",\"#count\":\"%v\"}", frame.TotalNested))
- }
- if frame.AttrCount > 0 {
- writeBuffer.WriteString("}")
- }
- if frame.HasNested {
- writeBuffer.WriteString("}")
- }
- }
- if writeBuffer.Len() > 65535 {
- writer.Write(writeBuffer.Bytes())
- writeBuffer.Reset()
- }
- prevToken = tokenI
- }
- if writeBuffer.Len() > 0 {
- writer.Write(writeBuffer.Bytes())
- writeBuffer.Reset()
- }
- }
- func Run(reader io.Reader, writer io.Writer) {
- xmlDecoder := xml.NewDecoder(reader)
- xmlDecoder.CharsetReader = charset.NewReaderLabel
- run(xmlDecoder, reader, writer)
- return
- }
// About trimNonGraphic:
// Copied from: https://github.com/basgys/goxml2json/blob/5452b6625ea2d3b3133c6b3ace15084d97dcc810/decoder.go
//
// Copyright (c) 2016 Bastien Gysler
//
// trimNonGraphic strips every leading and trailing rune of s that is either
// not graphic or is white space, and returns what remains. Runes between the
// first and the last kept rune are returned untouched, including inner spaces.
// If s contains no rune worth keeping, the empty string is returned.
//
// Graphic characters include letters, marks, numbers, punctuation, symbols,
// and spaces, from categories L, M, N, P, S, Zs.
// Spacing characters are set by category Z and property Pattern_White_Space.
func trimNonGraphic(s string) string {
	runes := []rune(s)

	// Rune indexes of the first and last rune to keep; -1 means none found.
	first, last := -1, -1
	for i, r := range runes {
		if unicode.IsGraphic(r) && !unicode.IsSpace(r) {
			if first < 0 {
				first = i
			}
			last = i
		}
	}

	if first < 0 {
		return ""
	}
	return string(runes[first : last+1])
}
|