214 lines
7.0 KiB
Go
214 lines
7.0 KiB
Go
|
package sax
|
||
|
|
||
|
import (
|
||
|
"fmt"
|
||
|
"strings"
|
||
|
"sync"
|
||
|
"unsafe"
|
||
|
)
|
||
|
|
||
|
|
||
|
|
||
|
/*
|
||
|
#cgo pkg-config: libxml-2.0
|
||
|
#include <libxml/tree.h>
|
||
|
#include <libxml/parser.h>
|
||
|
#include <libxml/parserInternals.h>
|
||
|
|
||
|
extern void startDocumentCgo(void*);
|
||
|
extern void endDocumentCgo(void*);
|
||
|
extern void startElementCgo(void*, const xmlChar*, const xmlChar**);
|
||
|
extern void startElementNoAttrCgo(void*, const xmlChar*, const xmlChar**);
|
||
|
extern void endElementCgo(void*, const xmlChar*);
|
||
|
extern void charactersCgo(void*, const xmlChar*, int);
|
||
|
extern void charactersRawCgo(void*, const xmlChar*, int);
|
||
|
// Since this structure contains pointers, take extra care to zero it out
|
||
|
// before passing it to Go code.
|
||
|
static inline xmlSAXHandler newHandlerStruct() {
|
||
|
xmlSAXHandler h = {0};
|
||
|
return h;
|
||
|
}
|
||
|
// Wrap a C macro in a function callable from Go.
|
||
|
static inline xmlError* getLastError() {
|
||
|
return xmlGetLastError();
|
||
|
}
|
||
|
*/
|
||
|
import "C"
|
||
|
|
||
|
import "github.com/eliben/gosax/pointer"
|
||
|
|
||
|
// Used to ensure that xmlInitParser is only called once.
|
||
|
var initOnce sync.Once
|
||
|
|
||
|
func init() {
|
||
|
initOnce.Do(func() {
|
||
|
C.xmlInitParser()
|
||
|
})
|
||
|
}
|
||
|
|
||
|
// SaxCallbacks collects callback functions to invoke on SAX events. Only
|
||
|
// populate callbacks you're interested in - callbacks left as nil will not
|
||
|
// be registered with the C layer and may save processing time.
|
||
|
// Some callbacks override others for optimization purposes - check the comments
|
||
|
// for more information.
|
||
|
type SaxCallbacks struct {
|
||
|
// StartDocument is invoked on the "start document" event.
|
||
|
StartDocument StartDocumentFunc
|
||
|
|
||
|
// EndDocument is invoked on the "end document" event
|
||
|
EndDocument EndDocumentFunc
|
||
|
|
||
|
// StartElement is invoked whenever the beginning of a new element is found.
|
||
|
// name will be the element name, and attrs a slice of attributes where
|
||
|
// attribute names alternate with values. For example, given the element
|
||
|
// <elem foo="bar" id="100"> the callback will get name="elem" and
|
||
|
// attrs=["foo", "bar", "id", "100"].
|
||
|
StartElement StartElementFunc
|
||
|
|
||
|
// StartElementNoAttr will override StartElement, if set. When you don't
|
||
|
// care about the attributes of an element, use this one - it will be faster
|
||
|
// because it doesn't have to do attribute unpacking, which is expensive.
|
||
|
StartElementNoAttr StartElementNoAttrFunc
|
||
|
|
||
|
// EndElement is invoked at the end of parsing an element (after closing tag
|
||
|
// has been processed), with name being the element name.
|
||
|
EndElement EndElementFunc
|
||
|
|
||
|
// Characters is invoked on character data inside elements. contents is the
|
||
|
// data, as string. Note that this callback may be invoked multiple times
|
||
|
// within a single tag.
|
||
|
Characters CharactersFunc
|
||
|
|
||
|
// CharactersRaw will override Characters, if set. It doesn't translate XML
|
||
|
// data into a Go string, but leaves it as an opaque pair of (ch, chlen),
|
||
|
// which you could use UnpackString to convert to a string if needed. This
|
||
|
// could be a useful optimization if you're only occasionally interested in
|
||
|
// the contents of character data.
|
||
|
CharactersRaw CharactersRawFunc
|
||
|
}
|
||
|
|
||
|
// Function types for the callback fields of SaxCallbacks.
type (
	// StartDocumentFunc is the type of the SaxCallbacks.StartDocument callback.
	StartDocumentFunc func()

	// EndDocumentFunc is the type of the SaxCallbacks.EndDocument callback.
	EndDocumentFunc func()

	// StartElementFunc is the type of the SaxCallbacks.StartElement callback.
	// It receives the element name and its attributes as a flat slice of
	// alternating names and values.
	StartElementFunc func(name string, attrs []string)

	// StartElementNoAttrFunc is the type of the
	// SaxCallbacks.StartElementNoAttr callback. It receives the element name
	// only.
	StartElementNoAttrFunc func(name string)

	// EndElementFunc is the type of the SaxCallbacks.EndElement callback. It
	// receives the element name.
	EndElementFunc func(name string)

	// CharactersFunc is the type of the SaxCallbacks.Characters callback. It
	// receives the character data as a Go string.
	CharactersFunc func(contents string)

	// CharactersRawFunc is the type of the SaxCallbacks.CharactersRaw
	// callback. It receives the character data as an opaque pointer and a
	// length, which UnpackString can convert to a string.
	CharactersRawFunc func(ch unsafe.Pointer, chlen int)
)
|
||
|
|
||
|
// UnpackString unpacks the opaque ch, chlen pair (that some callbacks in
|
||
|
// this package may create) into a Go string.
|
||
|
func UnpackString(ch unsafe.Pointer, chlen int) string {
|
||
|
return C.GoStringN((*C.char)(ch), C.int(chlen))
|
||
|
}
|
||
|
|
||
|
// ParseFile parses an XML file with the given name using SAX, with cb as
|
||
|
// the callbacks. The file name is required, rather than a reader, because it
|
||
|
// gets passed directly to the C layer.
|
||
|
func ParseFile(filename string, cb SaxCallbacks) error {
|
||
|
var cfilename *C.char = C.CString(filename)
|
||
|
defer C.free(unsafe.Pointer(cfilename))
|
||
|
|
||
|
// newHandlerStruct zeroes out all the pointers; we assign only those that
|
||
|
// are passed as non-nil in SaxCallbacks.
|
||
|
SAXhandler := C.newHandlerStruct()
|
||
|
|
||
|
if cb.StartDocument != nil {
|
||
|
SAXhandler.startDocument = C.startDocumentSAXFunc(C.startDocumentCgo)
|
||
|
}
|
||
|
|
||
|
if cb.EndDocument != nil {
|
||
|
SAXhandler.endDocument = C.endDocumentSAXFunc(C.endDocumentCgo)
|
||
|
}
|
||
|
|
||
|
if cb.StartElement != nil {
|
||
|
SAXhandler.startElement = C.startElementSAXFunc(C.startElementCgo)
|
||
|
}
|
||
|
// StartElementNoAttr overrides StartElement
|
||
|
if cb.StartElementNoAttr != nil {
|
||
|
SAXhandler.startElement = C.startElementSAXFunc(C.startElementNoAttrCgo)
|
||
|
}
|
||
|
|
||
|
if cb.EndElement != nil {
|
||
|
SAXhandler.endElement = C.endElementSAXFunc(C.endElementCgo)
|
||
|
}
|
||
|
|
||
|
if cb.Characters != nil {
|
||
|
SAXhandler.characters = C.charactersSAXFunc(C.charactersCgo)
|
||
|
}
|
||
|
// CharactersRaw overrides Characters
|
||
|
if cb.CharactersRaw != nil {
|
||
|
SAXhandler.characters = C.charactersSAXFunc(C.charactersRawCgo)
|
||
|
}
|
||
|
|
||
|
// Pack the callbacks structure into an opaque unsafe.Pointer which we'll
|
||
|
// pass to C as user_data, and C will pass it back to our Go callbacks.
|
||
|
user_data := pointer.Save(&cb)
|
||
|
defer pointer.Unref(user_data)
|
||
|
|
||
|
rc := C.xmlSAXUserParseFile(&SAXhandler, user_data, cfilename)
|
||
|
if rc != 0 {
|
||
|
xmlErr := C.getLastError()
|
||
|
msg := strings.TrimSpace(C.GoString(xmlErr.message))
|
||
|
return fmt.Errorf("line %v: error: %v", xmlErr.line, msg)
|
||
|
}
|
||
|
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
//export goStartDocument
|
||
|
func goStartDocument(user_data unsafe.Pointer) {
|
||
|
gcb := pointer.Restore(user_data).(*SaxCallbacks)
|
||
|
gcb.StartDocument()
|
||
|
}
|
||
|
|
||
|
//export goEndDocument
|
||
|
func goEndDocument(user_data unsafe.Pointer) {
|
||
|
gcb := pointer.Restore(user_data).(*SaxCallbacks)
|
||
|
gcb.EndDocument()
|
||
|
}
|
||
|
|
||
|
//export goStartElement
|
||
|
func goStartElement(user_data unsafe.Pointer, name *C.char, attrs **C.char, attrlen C.int) {
|
||
|
// Passing attrs to Go is tricky because it's an array of C strings,
|
||
|
// terminated with a NULL pointer. The C callback startElementCgo calculates
|
||
|
// the length of the array and passes it in as attrlen. We still have to
|
||
|
// convert it to a Go slice, by mapping a slice on the underlying storage
|
||
|
// and copying the attributes, one by one. This is all rather expensive, so
|
||
|
// consider using the StartElementNoAttr callback instead, when applicable.
|
||
|
gcb := pointer.Restore(user_data).(*SaxCallbacks)
|
||
|
length := int(attrlen)
|
||
|
var goattrs []string
|
||
|
if length > 0 {
|
||
|
tmpslice := (*[1 << 30]*C.char)(unsafe.Pointer(attrs))[:length:length]
|
||
|
goattrs = make([]string, length)
|
||
|
for i, s := range tmpslice {
|
||
|
goattrs[i] = C.GoString(s)
|
||
|
}
|
||
|
}
|
||
|
gcb.StartElement(C.GoString(name), goattrs)
|
||
|
}
|
||
|
|
||
|
//export goStartElementNoAttr
|
||
|
func goStartElementNoAttr(user_data unsafe.Pointer, name *C.char) {
|
||
|
gcb := pointer.Restore(user_data).(*SaxCallbacks)
|
||
|
gcb.StartElementNoAttr(C.GoString(name))
|
||
|
}
|
||
|
|
||
|
//export goEndElement
|
||
|
func goEndElement(user_data unsafe.Pointer, name *C.char) {
|
||
|
gcb := pointer.Restore(user_data).(*SaxCallbacks)
|
||
|
gcb.EndElement(C.GoString(name))
|
||
|
}
|
||
|
|
||
|
//export goCharacters
|
||
|
func goCharacters(user_data unsafe.Pointer, ch *C.char, chlen C.int) {
|
||
|
gcb := pointer.Restore(user_data).(*SaxCallbacks)
|
||
|
gcb.Characters(C.GoStringN(ch, chlen))
|
||
|
}
|
||
|
|
||
|
//export goCharactersRaw
|
||
|
func goCharactersRaw(user_data unsafe.Pointer, ch *C.char, chlen C.int) {
|
||
|
gcb := pointer.Restore(user_data).(*SaxCallbacks)
|
||
|
gcb.CharactersRaw(unsafe.Pointer(ch), int(chlen))
|
||
|
}
|