diff --git a/.gitignore b/.gitignore
index 04b7ce2..df1aa4b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -47,4 +47,6 @@ fabric.properties
 
 # Don't include the full snapshot ZIP since it's massive.
 kiva_ds_json.zip
-*.json
\ No newline at end of file
+*.json
+spark-*/
+*.swp
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..8c10745
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1 @@
+py4j>=0.10.4
diff --git a/start_pyspark.sh b/start_pyspark.sh
new file mode 100755
index 0000000..dd8e2d8
--- /dev/null
+++ b/start_pyspark.sh
@@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+#
+# Launch PySpark with Jupyter Notebook as the driver front end.
+#
+# The Spark distribution is downloaded into the current directory on first
+# use and reused afterwards. Extra arguments are passed through to pyspark.
+#
+# Requires wget, tar and jupyter on PATH.
+
+set -euo pipefail
+
+readonly SPARK_DIR="spark-2.0.1-bin-hadoop2.7"
+readonly SPARK_URL="http://apache.claz.org/spark/spark-2.0.1/spark-2.0.1-bin-hadoop2.7.tgz"
+
+# Scratch directory used while downloading; removed by the EXIT trap below.
+staging_dir=""
+
+# Print an error message to stderr and abort.
+die() {
+  printf 'error: %s\n' "$*" >&2
+  exit 1
+}
+
+# Download and unpack Spark into a scratch directory, then move it into place.
+# Unpacking straight into the working directory would leave a partial
+# $SPARK_DIR behind after a failed download, and the existence check below
+# would then skip the download on every later run.
+fetch_spark() {
+  command -v wget >/dev/null || die "wget is required to download Spark"
+  # The "spark-" prefix keeps the scratch directory matched by .gitignore.
+  staging_dir=$(mktemp -d "spark-download.XXXXXX") \
+    || die "could not create a scratch directory"
+  trap 'rm -rf -- "$staging_dir"' EXIT
+
+  echo "Downloading Spark distribution..."
+  wget "$SPARK_URL" -O - | tar xzf - -C "$staging_dir" \
+    || die "failed to download or unpack $SPARK_URL"
+  mv -- "$staging_dir/$SPARK_DIR" "$SPARK_DIR" \
+    || die "archive did not contain $SPARK_DIR"
+
+  rm -rf -- "$staging_dir"
+  trap - EXIT
+}
+
+if [[ ! -d "$SPARK_DIR" ]]; then
+  fetch_spark
+fi
+
+# exec so that signals reach pyspark/jupyter directly rather than a wrapper shell.
+export PYSPARK_DRIVER_PYTHON="jupyter"
+export PYSPARK_DRIVER_PYTHON_OPTS="notebook"
+exec "./$SPARK_DIR/bin/pyspark" "$@"