Golang. UTF-8 String from unsafe pointer

By | January 2, 2023

This is a continuation of the previous post “Compare storage of string in memory for c++ and golang using gdb”, where the gdb debugger was used to read the characters of a string variable directly from memory. The example below shows how to read the bytes of a string from memory using an unsafe pointer. The string is encoded in UTF-8 and contains both Latin and Cyrillic letters. In UTF-8, each basic Latin (ASCII) character occupies 1 byte, while each Cyrillic character is encoded in 2 bytes.
Here is the code:

package main

import (
    "fmt"
    "unsafe"
    "unicode/utf8"
)

// main prints a UTF-8 string, then the pointer and length stored in its
// header, and finally re-prints the text by walking the raw bytes behind
// that pointer one UTF-8 sequence at a time.
func main() {
    // StringHeader mirrors the two-word layout this program assumes a Go
    // string value has: a pointer to the bytes followed by the byte length.
    // (On Go 1.20+ unsafe.StringData returns the same pointer without
    // depending on the layout.)
    type StringHeader struct {
        Data unsafe.Pointer
        Len  int
    }

    str := "Hello Кириллица"            // Hello Cyrillic
    rCnt := utf8.RuneCountInString(str) // Number of letters (runes)
    sLen := len(str)                    // Number of bytes
    fmt.Printf("%s (Length in bytes=%d, Number of runes= %d)\n", str, sLen, rCnt)

    // Reinterpret the string variable as its header to reach the raw fields.
    hdr := (*StringHeader)(unsafe.Pointer(&str))
    dataStr := hdr.Data
    lenStr := hdr.Len
    fmt.Printf("Pointer to character sequence: 0x%x\n", dataStr)
    fmt.Printf("Number of bytes: %d\n", lenStr)

    // byteAt reads the single byte at offset i from the start of the data.
    // The uintptr arithmetic and the conversion back to unsafe.Pointer are
    // kept in one expression, as the unsafe package rules require; reading
    // byte by byte also avoids unaligned, endian-dependent wide loads.
    byteAt := func(i int) byte {
        return *(*byte)(unsafe.Pointer(uintptr(dataStr) + uintptr(i)))
    }

    fmt.Printf("From unsafe pointer: ")
    seq := make([]byte, 0, utf8.UTFMax)
    // Walk by byte offset so the loop can never run past the length stored
    // in the header, even if the data is not well-formed UTF-8.
    for index := 0; index < lenStr; {
        lead := byteAt(index)

        // The high bits of the lead byte give the sequence length:
        // 0xxxxxxx -> 1, 110xxxxx -> 2, 1110xxxx -> 3, 11110xxx -> 4.
        // Anything else (a stray continuation byte) is emitted as one byte.
        size := 1
        switch {
        case lead&0xE0 == 0xC0:
            size = 2
        case lead&0xF0 == 0xE0:
            size = 3
        case lead&0xF8 == 0xF0:
            size = 4
        }
        // A truncated trailing sequence must not read beyond the string.
        if remaining := lenStr - index; size > remaining {
            size = remaining
        }

        seq = seq[:0]
        for k := 0; k < size; k++ {
            seq = append(seq, byteAt(index+k))
        }
        fmt.Printf("%s", seq)
        index += size
    }
    fmt.Printf("\n")
}
 

Compilation:


go build strunsafe.go

Execution:


./strunsafe
Hello Кириллица (Length in bytes=24, Number of runes= 15)
Pointer to character sequence: 0x4c21d5
Number of bytes: 24
From unsafe pointer: Hello Кириллица

Leave a Reply

Your email address will not be published. Required fields are marked *