Documentation
¶
Index ¶
- func StandardFunctions() d.Fns
- type Col
- type DF
- func (f *DF) AllRows() iter.Seq2[int, []any]
- func (f *DF) AppendColumn(col d.Column, replace bool) error
- func (f *DF) AppendDF(df d.DF) (d.DF, error)
- func (f *DF) By(groupBy string, fns ...string) (d.DF, error)
- func (f *DF) Categorical(colName string, catMap d.CategoryMap, fuzz int, defaultVal any, levels []any) (d.Column, error)
- func (f *DF) Copy() d.DF
- func (f *DF) Interp(points d.HasIter, xSfield, xIfield, yfield, outField string) (d.DF, error)
- func (f *DF) Join(df d.HasIter, joinOn string) (d.DF, error)
- func (f *DF) Len() int
- func (f *DF) Less(i, j int) bool
- func (f *DF) Row(rowNum int) []any
- func (f *DF) RowCount() int
- func (f *DF) SetParent() error
- func (f *DF) Sort(ascending bool, sortCols string) error
- func (f *DF) SourceQuery() string
- func (f *DF) String() string
- func (f *DF) Swap(i, j int)
- func (f *DF) Table(cols string) (d.DF, error)
- func (f *DF) Where(condition string) (d.DF, error)
Examples ¶
Constants ¶
This section is empty.
Variables ¶
This section is empty.
Functions ¶
func StandardFunctions ¶
StandardFunctions returns the built-in functions for in-memory data to be used by Parser.
Types ¶
type Col ¶
Col implements Column for in-memory data.
func NewCol ¶
NewCol creates a new mem.Column from data.
Example ¶
Create a new column from a *Vector.
const n = 100
x := make([]int64, n)
for ind := range n {
x[ind] = int64(ind * 2)
}
var (
v *d.Vector
e1 error
)
// NewVector will convert the type.
if v, e1 = d.NewVector(x, d.DTint); e1 != nil {
panic(e1)
}
// Note, calling NewCol with x will generate an error since x is not of type int
// and NewCol does not convert types.
var (
col *Col
e2 error
)
if col, e2 = NewCol(v, d.ColName("x")); e2 != nil {
panic(e2)
}
fmt.Println(col.AsAny().([]int)[0:10])
Output: [0 2 4 6 8 10 12 14 16 18]
type DF ¶
DF implements DF for in-memory data.
func DBload ¶
DBload loads a *DF from a query.
Example ¶
Connect to ClickHouse and pull the data from a query. Note that this code is identical to the DBload example in df/sql. The mem/df package loads the data into memory, the sql/df package does not.
const (
dbProvider = "clickhouse"
chTable = "testing.d1"
)
// ClickHouse connection parameters.
user := os.Getenv("user")
host := os.Getenv("host")
password := os.Getenv("password")
db := newConnectCH(host, user, password)
qry := "SELECT k, x FROM " + chTable
var (
dlct *d.Dialect
e error
)
if dlct, e = d.NewDialect(dbProvider, db); e != nil {
panic(e)
}
var (
df *DF
e1 error
)
if df, e1 = DBload(qry, dlct); e1 != nil {
panic(e1)
}
fmt.Println("# of Rows: ", df.RowCount())
fmt.Println("Columns: ", df.ColumnNames())
Output: # of Rows: 6 Columns: [k x]
func FileLoad ¶
FileLoad loads a *DF from a *d.Files struct.
Example ¶
Load a CSV with a header. Column types are determined by peeking at the data.
var (
f *d.Files
e1 error
)
if f, e1 = d.NewFiles(d.FileStrict(true)); e1 != nil {
panic(e1)
}
// this file is in df/data.
fileToOpen := os.Getenv("datapath") + "d1.csv"
if ex := f.Open(fileToOpen); ex != nil {
panic(ex)
}
var (
df *DF
e2 error
)
if df, e2 = FileLoad(f); e2 != nil {
panic(e2)
}
fmt.Println("# of Rows: ", df.RowCount())
fmt.Println("Columns: ", df.ColumnNames())
Output: # of Rows: 6 Columns: [k x y yy z dt R]
Example (Types) ¶
Load a CSV with a header. Column names & types are specified by user. The source .CSV has a header, which is skipped. Note, if you specify types, you must also specify names.
// ordered as in the file
fieldNames := []string{"k", "x", "y", "yy", "z", "dt", "RNew"}
fieldTypes := []d.DataTypes{d.DTint, d.DTfloat, d.DTint, d.DTint, d.DTstring, d.DTdate, d.DTfloat}
var (
f *d.Files
e1 error
)
if f, e1 = d.NewFiles(d.FileFieldNames(fieldNames), d.FileFieldTypes(fieldTypes)); e1 != nil {
panic(e1)
}
fileToOpen := os.Getenv("datapath") + "d1.csv"
if ex := f.Open(fileToOpen); ex != nil {
panic(ex)
}
var (
df *DF
e2 error
)
if df, e2 = FileLoad(f); e2 != nil {
panic(e2)
}
ct, _ := df.ColumnTypes()
fmt.Println(ct)
Output: [DTint DTfloat DTint DTint DTstring DTdate DTfloat]
func NewDF ¶
NewDF creates a *DF from input.
input - can be (in order of what's tried):
- *DF. This is copied.
- *Col. This is copied.
- d.Column.
- *Vector.
- HasMQdlct. The query is run to fetch the data.
- d.DF. The data is pulled to construct the output.
func NewDFcol ¶
NewDFcol creates a DF from *mem.Col.
Example ¶
Create columns from slices and then create a new dataframe from them
const n = 100
x := make([]int, n)
y := make([]float64, n)
for ind := range n {
x[ind] = ind * 2
y[ind] = float64(x[ind])
}
var (
col1, col2 *Col
e1 error
)
if col1, e1 = NewCol(x, d.ColName("x")); e1 != nil {
panic(e1)
}
if col2, e1 = NewCol(y, d.ColName("y")); e1 != nil {
panic(e1)
}
var (
df *DF
e2 error
)
if df, e2 = NewDFcol([]*Col{col1, col2}); e2 != nil {
panic(e2)
}
var (
xf []float64
e3 error
)
// This will convert x to a float64.
if xf, e3 = df.Column("x").Data().AsFloat(); e3 != nil {
panic(e3)
}
fmt.Println(xf[0:10])
Output: [0 2 4 6 8 10 12 14 16 18]
func NewDFseq ¶
NewDFseq creates a *DF with a single column, name. That column is a DTint sequence from 0 to n-1.
func (*DF) AllRows ¶
AllRows iterates through the rows of the dataframe. It returns the row # and the values of f for that row.
func (*DF) AppendColumn ¶
AppendColumn masks the DFcore version so that we can handle appending scalars
Example ¶
Append a column to a dataframe
const (
n = 100
slen = 4
)
var (
df *DF
e1 error
)
if df, e1 = NewDFseq(n, "seq"); e1 != nil {
panic(e1)
}
x := make([]string, n)
for ind := range n {
x[ind] = d.RandomLetters(slen)
}
// create a column named "x" from x.
var (
col *Col
e2 error
)
if col, e2 = NewCol(x, d.ColName("x")); e2 != nil {
panic(e2)
}
if e := df.AppendColumn(col, false); e != nil {
panic(e)
}
fmt.Println(df.ColumnNames())
Output: [seq x]
func (*DF) By ¶
By creates a new *DF with function fns calculated within the groups defined by groupBy.
groupBy - comma-separated list of fields to group on. If groupBy is empty, then the output will have 1 row.
fns - functions to calculate on the By groups.
Example ¶
Create a new table grouping on one column with two summary columns.
const n = 1000
// create source dataframe.
x := make([]int, n)
y := make([]float64, n)
for ind := range n {
x[ind] = ind % 4
y[ind] = float64(ind)
}
var (
cx, cy *Col
e0 error
)
if cx, e0 = NewCol(x, d.ColName("x")); e0 != nil {
panic(e0)
}
if cy, e0 = NewCol(y, d.ColName("y")); e0 != nil {
panic(e0)
}
var (
df *DF
e1 error
)
if df, e1 = NewDFcol([]*Col{cx, cy}); e1 != nil {
panic(e1)
}
var (
dfBy d.DF
e2 error
)
// produce a summary
if dfBy, e2 = df.By("x", "my := mean(y)", "sy := sum(y)"); e2 != nil {
panic(e2)
}
if e := dfBy.Sort(true, "x"); e != nil {
panic(e)
}
fmt.Println(dfBy.Column("x").Data().AsAny())
fmt.Println(dfBy.Column("my").Data().AsAny())
fmt.Println(dfBy.Column("sy").Data().AsAny())
Output: [0 1 2 3] [498 499 500 501] [124500 124750 125000 125250]
Example (Global) ¶
Create a summary table that requires a global summary in the calculation.
const n = 1000
// create source dataframe.
x := make([]int, n)
y := make([]float64, n)
for ind := range n {
x[ind] = ind % 4
y[ind] = float64(ind)
}
var (
cx, cy *Col
e0 error
)
if cx, e0 = NewCol(x, d.ColName("x")); e0 != nil {
panic(e0)
}
if cy, e0 = NewCol(y, d.ColName("y")); e0 != nil {
panic(e0)
}
var (
df *DF
e1 error
)
if df, e1 = NewDFcol([]*Col{cx, cy}); e1 != nil {
panic(e1)
}
var (
dfBy d.DF
e2 error
)
// produce a summary
if dfBy, e2 = df.By("x", "cnt := count(x)", "total := count(global(x))", "prop := 100.0 * float(cnt)/float(total)"); e2 != nil {
panic(e2)
}
// if dfBy, e2 = df.By("x", "cnt := count(x)", "prop := float(cnt)/float(count(global(x)))"); e2 != nil {
// panic(e2)
// }
if e := dfBy.Sort(true, "x"); e != nil {
panic(e)
}
fmt.Println(dfBy.Column("x").Data().AsAny())
fmt.Println(dfBy.Column("cnt").Data().AsAny())
fmt.Println(dfBy.Column("total").Data().AsAny())
fmt.Println(dfBy.Column("prop").Data().AsAny())
//
Output: [0 1 2 3] [250 250 250 250] [1000 1000 1000 1000] [25 25 25 25]
Example (OneRow) ¶
Create a summary with no grouping column.
const n = 1000
// create source dataframe.
x := make([]int, n)
y := make([]float64, n)
for ind := range n {
x[ind] = ind % 4
y[ind] = float64(ind)
}
var (
cx, cy *Col
e0 error
)
if cx, e0 = NewCol(x, d.ColName("x")); e0 != nil {
panic(e0)
}
if cy, e0 = NewCol(y, d.ColName("y")); e0 != nil {
panic(e0)
}
var (
df *DF
e1 error
)
if df, e1 = NewDFcol([]*Col{cx, cy}); e1 != nil {
panic(e1)
}
var (
dfBy d.DF
e2 error
)
// produce a summary
if dfBy, e2 = df.By("", "cnt := count(y)", "sy := sum(y)"); e2 != nil {
panic(e2)
}
fmt.Println(dfBy.Column("cnt").Data().AsAny())
fmt.Println(dfBy.Column("sy").Data().AsAny())
Output: [1000] [499500]
Example (TwoColumns) ¶
Create a new table grouping on two columns with two summary columns.
const n = 1000
// create source dataframe.
x := make([]int, n)
r := make([]int, n)
y := make([]float64, n)
for ind := range n {
x[ind] = ind % 4
r[ind] = ind % 8
y[ind] = float64(ind)
}
var (
cx, cr, cy *Col
e0 error
)
if cx, e0 = NewCol(x, d.ColName("x")); e0 != nil {
panic(e0)
}
if cr, e0 = NewCol(r, d.ColName("r")); e0 != nil {
panic(e0)
}
if cy, e0 = NewCol(y, d.ColName("y")); e0 != nil {
panic(e0)
}
var (
df *DF
e1 error
)
if df, e1 = NewDFcol([]*Col{cx, cr, cy}); e1 != nil {
panic(e1)
}
var (
dfBy d.DF
e2 error
)
// produce a summary
if dfBy, e2 = df.By("x,r", "my := mean(y)", "sy := sum(y)"); e2 != nil {
panic(e2)
}
if e := dfBy.Sort(true, "x,r"); e != nil {
panic(e)
}
fmt.Println(dfBy.Column("x").Data().AsAny())
fmt.Println(dfBy.Column("r").Data().AsAny())
fmt.Println(dfBy.Column("my").Data().AsAny())
fmt.Println(dfBy.Column("sy").Data().AsAny())
Output: [0 0 1 1 2 2 3 3] [0 4 1 5 2 6 3 7] [496 500 497 501 498 502 499 503] [62000 62500 62125 62625 62250 62750 62375 62875]
func (*DF) Categorical ¶
func (f *DF) Categorical(colName string, catMap d.CategoryMap, fuzz int, defaultVal any, levels []any) (d.Column, error)
Categorical produces a categorical column from a source column.
colName - name of the source column
catMap - optionally supply a category map of source value -> category level
fuzz - if a source column value has counts < fuzz, then it is put in the 'other' category.
defaultVal - optional source column value for the 'other' category.
levels - slice of source values to make categories from
func (*DF) Interp ¶
Interp interpolates the source columns (xSfield, yfield) at the xIfield points.
points - input iterator (e.g. Column or DF) that yields the points to interpolate at
xSfield - column name of x values in source DF
xIfield - column name of x values in points
yfield - column name of y values in source DF
outField - column name of interpolated y's in return DF
The output DF is restricted to interpolated points that lie within the data. It has columns:
xIfield - points at which to interpolate. This may be a subset of the input "points".
outField - interpolated values.
Example ¶
const n1 = 10
// create first dataframe.
x := make([]float64, n1)
y := make([]float64, n1)
for ind := range n1 {
x[ind] = float64(ind)
y[ind] = float64(ind) * 4
}
var (
cx1, cy1 *Col
e0 error
)
if cx1, e0 = NewCol(x, d.ColName("x")); e0 != nil {
panic(e0)
}
if cy1, e0 = NewCol(y, d.ColName("y")); e0 != nil {
panic(e0)
}
var (
df1 *DF
e1 error
)
if df1, e1 = NewDFcol([]*Col{cx1, cy1}); e1 != nil {
panic(e1)
}
cxi := []float64{0.5, 4.25, -1, 20, 6.8}
coli, _ := NewCol(cxi, d.ColName("xi"))
dfOut, _ := df1.Interp(coli, "x", "xi", "y", "yInterp")
fmt.Println(dfOut.Column("yInterp").Data().AsAny())
Output: [2 17 27.2]
func (*DF) Join ¶
Join joins f and df on the columns of joinOn. This is an inner join.
df - data to join.
joinOn - comma-separated list of fields to join on. These fields must have the same name in both data sets.
Example ¶
const (
n1 = 10
n2 = 15
)
// create first dataframe.
x := make([]int, n1)
y := make([]float64, n1)
for ind := range n1 {
x[ind] = ind
y[ind] = float64(ind) * 4
}
var (
cx1, cy1 *Col
e0 error
)
if cx1, e0 = NewCol(x, d.ColName("x")); e0 != nil {
panic(e0)
}
if cy1, e0 = NewCol(y, d.ColName("y")); e0 != nil {
panic(e0)
}
var (
df1 *DF
e1 error
)
if df1, e1 = NewDFcol([]*Col{cx1, cy1}); e1 != nil {
panic(e1)
}
// create second dataframe.
x = make([]int, n2)
z := make([]float64, n2)
for ind := range n2 {
x[ind] = ind
z[ind] = -float64(ind) * 4
}
var (
cx2, cz2 *Col
e2 error
)
if cx2, e2 = NewCol(x, d.ColName("x")); e2 != nil {
panic(e2)
}
if cz2, e2 = NewCol(z, d.ColName("z")); e2 != nil {
panic(e2)
}
var (
df2 *DF
e3 error
)
if df2, e3 = NewDFcol([]*Col{cx2, cz2}); e3 != nil {
panic(e3)
}
var (
dfJoin d.DF
e4 error
)
if dfJoin, e4 = df1.Join(df2, "x"); e4 != nil {
panic(e4)
}
fmt.Println(dfJoin.Column("x").Data().AsAny())
fmt.Println(dfJoin.Column("y").Data().AsAny())
fmt.Println(dfJoin.Column("z").Data().AsAny())
Output: [0 1 2 3 4 5 6 7 8 9] [0 4 8 12 16 20 24 28 32 36] [-0 -4 -8 -12 -16 -20 -24 -28 -32 -36]
Example (TwoColumns) ¶
Join based on two columns. Compare to the same example under df/sql.
const (
nLeft = 10
nRight = 15
dbProvider = "clickhouse"
)
var (
dfLeft, dfRight d.DF
e1 error
)
if dfLeft, e1 = NewDFseq(nLeft, "seq"); e1 != nil {
panic(e1)
}
if dfRight, e1 = NewDFseq(nRight, "seq"); e1 != nil {
panic(e1)
}
// second column to join on
if e := d.Parse(dfLeft, "b := if(mod(seq,4) == 0, 'a', if(mod(seq,4)==1, 'b', if(mod(seq,4)==2, 'c', 'd')))"); e != nil {
panic(e)
}
if e := d.Parse(dfRight, "b := if(mod(seq,4) == 0, 'a', 'b')"); e != nil {
panic(e)
}
// add another column to each
if e := d.Parse(dfLeft, "x := exp(float(seq) / 100.0)"); e != nil {
panic(e)
}
if e := d.Parse(dfRight, "y := seq^2"); e != nil {
panic(e)
}
var (
dfJoin d.DF
e2 error
)
if dfJoin, e2 = dfLeft.Join(dfRight, "seq,b"); e2 != nil {
panic(e2)
}
fmt.Println(dfJoin.RowCount())
fmt.Println(dfJoin.Column("seq").Data().AsAny())
fmt.Println(dfJoin.Column("b").Data().AsAny())
fmt.Println(dfJoin.Column("y").Data().AsAny())
Output: 6 [0 1 4 5 8 9] [a b a b a b] [0 1 16 25 64 81]
func (*DF) Sort ¶
Sort sorts f according to sortCols.
ascending - true = sort ascending
sortCols - comma-separated list of columns to sort on.
func (*DF) SourceQuery ¶
SourceQuery returns the query used to load f, if any.
func (*DF) Table ¶
Table produces a table based on cols. cols is a comma-separated list of fields. The metrics within each group calculated are:
n - count of rows
rate - fraction of original row count.
func (*DF) Where ¶
Where subsets f to rows where condition is true.
Example ¶
const n1 = 10
// create dataframe.
x := make([]int, n1)
y := make([]float64, n1)
for ind := range n1 {
x[ind] = ind
y[ind] = float64(ind) * 4
}
var (
cx1, cy1 *Col
e0 error
)
if cx1, e0 = NewCol(x, d.ColName("x")); e0 != nil {
panic(e0)
}
if cy1, e0 = NewCol(y, d.ColName("y")); e0 != nil {
panic(e0)
}
var (
df1 *DF
e1 error
)
if df1, e1 = NewDFcol([]*Col{cx1, cy1}); e1 != nil {
panic(e1)
}
// subset to where x < 4 or x > 8
dfOut, _ := df1.Where("x < 4 || x > 8")
fmt.Println(dfOut.Column("x").Data().AsAny())
Output: [0 1 2 3 9]