Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,309 changes: 1,098 additions & 211 deletions Cargo.lock

Large diffs are not rendered by default.

15 changes: 8 additions & 7 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

[package]
name = "datafusion-python"
version = "20.0.0"
version = "21.0.0"
homepage = "https://github.com/apache/arrow-datafusion-python"
repository = "https://github.com/apache/arrow-datafusion-python"
authors = ["Apache Arrow <dev@arrow.apache.org>"]
Expand All @@ -35,19 +35,20 @@ default = ["mimalloc"]
tokio = { version = "1.24", features = ["macros", "rt", "rt-multi-thread", "sync"] }
rand = "0.8"
pyo3 = { version = "0.18.1", features = ["extension-module", "abi3", "abi3-py37"] }
datafusion = { version = "20.0.0", features = ["pyarrow", "avro"]}
datafusion-common = { version = "20.0.0", features = ["pyarrow"]}
datafusion-expr = { version = "20.0.0" }
datafusion-optimizer = { version = "20.0.0" }
datafusion-sql = { version = "20.0.0" }
datafusion-substrait = { version = "20.0.0" }
datafusion = { version = "21.0.0", features = ["pyarrow", "avro"]}
datafusion-common = { version = "21.0.0", features = ["pyarrow"]}
datafusion-expr = { version = "21.0.0" }
datafusion-optimizer = { version = "21.0.0" }
datafusion-sql = { version = "21.0.0" }
datafusion-substrait = { version = "21.0.0" }
uuid = { version = "1.2", features = ["v4"] }
mimalloc = { version = "0.1", optional = true, default-features = false }
async-trait = "0.1"
futures = "0.3"
object_store = { version = "0.5.3", features = ["aws", "gcp", "azure"] }
parking_lot = "0.12"
regex-syntax = "0.6.28"
url = "2.2"

[lib]
name = "datafusion_python"
Expand Down
7 changes: 4 additions & 3 deletions datafusion/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -164,16 +164,17 @@ def test_join():
[pa.array([1, 2, 3]), pa.array([4, 5, 6])],
names=["a", "b"],
)
df = ctx.create_dataframe([[batch]])
df = ctx.create_dataframe([[batch]], "l")
Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to alias these tables (as `l` and `r`) to fix an ambiguous reference to column `a` later in the test; this seems like an improvement.


batch = pa.RecordBatch.from_arrays(
[pa.array([1, 2]), pa.array([8, 10])],
names=["a", "c"],
)
df1 = ctx.create_dataframe([[batch]])
df1 = ctx.create_dataframe([[batch]], "r")

df = df.join(df1, join_keys=(["a"], ["a"]), how="inner")
df = df.sort(column("a").sort(ascending=True))
df.show()
df = df.sort(column("l.a").sort(ascending=True))
table = pa.Table.from_batches(df.collect())

expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
Expand Down
2 changes: 1 addition & 1 deletion datafusion/tests/test_indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ def test_err(df):
with pytest.raises(Exception) as e_info:
df["c"]

assert "Schema error: No field named 'c'" in e_info.value.args[0]
assert 'Schema error: No field named "c"' in e_info.value.args[0]

with pytest.raises(Exception) as e_info:
df[1]
Expand Down
2 changes: 1 addition & 1 deletion datafusion/tests/test_store.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ def local():
@pytest.fixture
def ctx(local):
ctx = SessionContext()
ctx.register_object_store("local", local, None)
ctx.register_object_store("file://local", local, None)
return ctx


Expand Down
15 changes: 10 additions & 5 deletions src/common/df_field.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
// under the License.

use datafusion::arrow::datatypes::DataType;
use datafusion_common::DFField;
use datafusion_common::{DFField, OwnedTableReference};
use pyo3::prelude::*;

use super::data_type::PyDataType;
Expand Down Expand Up @@ -46,9 +46,14 @@ impl From<DFField> for PyDFField {
impl PyDFField {
#[new]
#[pyo3(signature = (qualifier=None, name="", data_type=DataType::Int64.into(), nullable=false))]
fn new(qualifier: Option<&str>, name: &str, data_type: PyDataType, nullable: bool) -> Self {
fn new(qualifier: Option<String>, name: &str, data_type: PyDataType, nullable: bool) -> Self {
PyDFField {
field: DFField::new(qualifier, name, data_type.into(), nullable),
field: DFField::new(
qualifier.map(|q| OwnedTableReference::from(q)),
name,
data_type.into(),
nullable,
),
}
}

Expand Down Expand Up @@ -91,8 +96,8 @@ impl PyDFField {
// fn py_unqualified_column(&self) -> PyResult<PyColumn> {}

#[pyo3(name = "qualifier")]
fn py_qualifier(&self) -> PyResult<Option<&String>> {
Ok(self.field.qualifier())
fn py_qualifier(&self) -> PyResult<Option<String>> {
Ok(self.field.qualifier().map(|q| format!("{}", q)))
}

// TODO: Need bindings for Arrow `Field` first
Expand Down
18 changes: 8 additions & 10 deletions src/context.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ use std::path::PathBuf;
use std::sync::Arc;

use object_store::ObjectStore;
use url::Url;
use uuid::Uuid;

use pyo3::exceptions::{PyKeyError, PyValueError};
Expand Down Expand Up @@ -49,7 +50,6 @@ use datafusion::physical_plan::SendableRecordBatchStream;
use datafusion::prelude::{
AvroReadOptions, CsvReadOptions, DataFrame, NdJsonReadOptions, ParquetReadOptions,
};
use datafusion_common::config::Extensions;
use datafusion_common::ScalarValue;
use pyo3::types::PyTuple;
use tokio::runtime::Runtime;
Expand Down Expand Up @@ -260,10 +260,9 @@ impl PySessionContext {
} else {
&upstream_host
};

self.ctx
.runtime_env()
.register_object_store(scheme, derived_host, store);
let url_string = format!("{}{}", scheme, derived_host);
let url = Url::parse(&url_string).unwrap();
self.ctx.runtime_env().register_object_store(&url, store);
Ok(())
}

Expand Down Expand Up @@ -699,20 +698,19 @@ impl PySessionContext {
part: usize,
py: Python,
) -> PyResult<PyRecordBatchStream> {
let ctx = TaskContext::try_new(
"task_id".to_string(),
let ctx = TaskContext::new(
None,
"session_id".to_string(),
HashMap::new(),
SessionConfig::new(),
HashMap::new(),
HashMap::new(),
Arc::new(RuntimeEnv::default()),
Extensions::default(),
);
// create a Tokio runtime to run the async code
let rt = Runtime::new().unwrap();
let plan = plan.plan.clone();
let fut: JoinHandle<datafusion_common::Result<SendableRecordBatchStream>> =
rt.spawn(async move { plan.execute(part, Arc::new(ctx?)) });
rt.spawn(async move { plan.execute(part, Arc::new(ctx)) });
let stream = wait_for_future(py, fut).map_err(py_datafusion_err)?;
Ok(PyRecordBatchStream::new(stream?))
}
Expand Down
2 changes: 1 addition & 1 deletion src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ use crate::sql::logical::PyLogicalPlan;
use crate::utils::wait_for_future;
use crate::{errors::DataFusionError, expr::PyExpr};
use datafusion::arrow::datatypes::Schema;
use datafusion::arrow::pyarrow::{PyArrowConvert, PyArrowException, PyArrowType};
use datafusion::arrow::pyarrow::{PyArrowConvert, PyArrowType};
use datafusion::arrow::util::pretty;
use datafusion::dataframe::DataFrame;
use datafusion::prelude::*;
Expand Down
2 changes: 1 addition & 1 deletion src/expr/column.rs
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ impl PyColumn {

/// Get the column relation
fn relation(&self) -> Option<String> {
self.col.relation.clone()
self.col.relation.as_ref().map(|r| format!("{}", r))
}

/// Get the fully-qualified column name
Expand Down
4 changes: 2 additions & 2 deletions src/expr/table_scan.rs
Original file line number Diff line number Diff line change
Expand Up @@ -67,8 +67,8 @@ impl Display for PyTableScan {
impl PyTableScan {
/// Retrieves the name of the table represented by this `TableScan` instance
#[pyo3(name = "table_name")]
fn py_table_name(&self) -> PyResult<&str> {
Ok(&self.table_scan.table_name)
fn py_table_name(&self) -> PyResult<String> {
Ok(format!("{}", self.table_scan.table_name))
}

/// TODO: Bindings for `TableSource` need to exist first. Left as a
Expand Down