Skip to content

slash3b/utfbom

Folders and files

NameName
Last commit message
Last commit date

Latest commit

 

History

79 Commits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 

Repository files navigation

This is a fork of github.com/dimchansky/utfbom.

utfbom

Godoc License Go Report Card Build Status Sourcegraph

Package utfbom is able to detect and remove the Unicode Byte Order Mark (BOM) from input streams.

Installation

    go get -u github.com/slash3b/utfbom

What is \uFEFF?

\uFEFF is the Unicode Byte Order Mark (BOM); it indicates the text encoding and byte order of the data that follows.
Go source code is defined to be UTF-8 text, so all string literals in Go source files are by default UTF-8 encoded sequences, making Go a UTF-8 compliant language at its core.
Note that the BOM is a zero-width character, so it is invisible when printed even though it changes the underlying bytes.

This library supports detection and trimming of the following BOM prefixes:

Encoding BOM Hex Values
UTF-8 0xef 0xbb 0xbf
UTF-16 (BE) 0xfe 0xff
UTF-16 (LE) 0xff 0xfe
UTF-32 (BE) 0x00 0x00 0xfe 0xff
UTF-32 (LE) 0xff 0xfe 0x00 0x00

go.dev/play

    package main

    import (
        "bytes"
        "encoding/hex"
        "fmt"
    )

    func main() {
        // a carries a leading BOM (U+FEFF); b is the same text without it.
        a := "\ufeffhey"
        b := "hey"

        // The BOM is zero width, so a and b look alike when printed,
        // yet they are different strings and different byte sequences.
        fmt.Println(a == b)
        fmt.Println(bytes.Equal([]byte(a), []byte(b)))

        // The hex dumps expose the three extra UTF-8 BOM bytes (ef bb bf) in a.
        fmt.Print(hex.Dump([]byte(a)))
        fmt.Print(hex.Dump([]byte(b)))
    }

    // Output:
    // false
    // false
    // 00000000  ef bb bf 68 65 79                                 |...hey|
    // 00000000  68 65 79                                          |hey|

Links:

Examples

Encoding detection

go.dev/play

    package main

    import (
        "fmt"

        "github.com/slash3b/utfbom"
    )

    func main() {
        input := "\ufeffhey"
        fmt.Printf("input string: %q\n", input)
        fmt.Printf("input bytes: %#x\n", input)

        enc := utfbom.DetectEncoding(input)
        fmt.Printf("detected encoding: %s\n", enc)

        fmt.Printf("is UTF16:%v\n", enc.AnyOf(utfbom.UTF16BigEndian, utfbom.UTF16LittleEndian))
        fmt.Printf("is UTF8:%v\n", enc.AnyOf(utfbom.UTF8))
    }

    // Output: 
    // input string: "\ufeffhey"
    // input bytes: 0xefbbbf686579
    // detected encoding: UTF8
    // is UTF16:false
    // is UTF8:true

BOM trimming

go.dev/play

    package main

    import (
        "fmt"

        "github.com/slash3b/utfbom"
    )

    func main() {
        input := "\ufeffhey"
        fmt.Printf("input string: %q\n", input)
        fmt.Printf("input bytes: %#x\n", input)

		output, enc := utfbom.Trim(input)
		fmt.Printf("detected encoding: %s\n", enc)
		fmt.Printf("output string: %q\n", output)
		fmt.Printf("output bytes:%#x\n", output)
	}

    // Output: 
    // input string: "\ufeffhey"
    // input bytes: 0xefbbbf686579
    // detected encoding: UTF8
    // output string: "hey"
    // output bytes:0x686579

Reading CSV file with BOM:

go.dev/play

    package main

    import (
        "bytes"
        "encoding/csv"
        "encoding/hex"
        "fmt"
        "strings"

        "github.com/slash3b/utfbom"
    )

    func main() {
        csvFile := "\uFEFFIndex,Customer Id,First Name\n" +
            "1,DD37Cf93aecA6Dc,Sheryl"

        urd := utfbom.NewReader(bytes.NewReader([]byte(csvFile)))
        crd := csv.NewReader(urd)

        out := ""
        for {
            row, err := crd.Read()
            if err != nil {
                break
            }

            out += strings.Join(row, ",")
        }

        fmt.Println("detected encoding:", urd.Enc)
        fmt.Println("before")
        fmt.Println(hex.Dump([]byte(csvFile)))
        fmt.Println("after")
        fmt.Println(hex.Dump([]byte(out)))
    }

    // Output:
    // detected encoding: UTF8
    // before
    // 00000000  ef bb bf 49 6e 64 65 78  2c 43 75 73 74 6f 6d 65  |...Index,Custome|
    // 00000010  72 20 49 64 2c 46 69 72  73 74 20 4e 61 6d 65 0a  |r Id,First Name.|
    // 00000020  31 2c 44 44 33 37 43 66  39 33 61 65 63 41 36 44  |1,DD37Cf93aecA6D|
    // 00000030  63 2c 53 68 65 72 79 6c                           |c,Sheryl|
    //
    // after
    // 00000000  49 6e 64 65 78 2c 43 75  73 74 6f 6d 65 72 20 49  |Index,Customer I|
    // 00000010  64 2c 46 69 72 73 74 20  4e 61 6d 65 31 2c 44 44  |d,First Name1,DD|
    // 00000020  33 37 43 66 39 33 61 65  63 41 36 44 63 2c 53 68  |37Cf93aecA6Dc,Sh|
    // 00000030  65 72 79 6c                                       |eryl|

About

A Go library for detecting, removing, and prepending Unicode Byte Order Marks (BOMs) in strings, byte slices, and io.Reader streams.

Topics

Resources

License

Stars

Watchers

Forks

Contributors