Spark Hands-On: [e2-spk-s02]
TRANSCRIPT
docker run -v /home/etadm/docker/spark/e2spkv01:/home:rw \
  -p 8088:8088 -p 8042:8042 \
  -h sandbox \
  -it sequenceiq/spark:1.6.0 bash
spark-shell \
  --master local \
  --jars /home/e2-spk-s02/jars/spark-csv_2.10-1.4.0.jar,/home/e2-spk-s02/jars/commons-csv-1.1.jar
// Create an object container
case class Word(text: String)

val fileName = "README.md"
val docs = sc.textFile(fileName)
val lower = docs.map(line => line.toLowerCase())
val words = lower.flatMap(line => line.split("\\s+"))

// Convert the RDD to a DataFrame using the case class
val words_df = words.map(Word(_)).toDF()
// Register it as a [TABLE]
words_df.registerTempTable("words")

val topWords = sqlContext.sql("SELECT text, count(text) AS n FROM words GROUP BY text ORDER BY n DESC LIMIT 10")
topWords.foreach(println)
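The same top-10 count can also be expressed with the DataFrame API instead of SQL. A minimal sketch against the words_df built above (not shown on the slides):

// DataFrame-API equivalent of the SQL query above;
// groupBy("text").count() yields a "count" column we can sort on
val topWords2 = words_df.groupBy("text").count().orderBy($"count".desc).limit(10)
topWords2.show()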
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

val conf = new SparkConf()
val sc = new SparkContext(conf)
// SQLContext is the entry point for Spark SQL and DataFrames
// (in spark-shell, sc and sqlContext already exist)
val sqlContext = new SQLContext(sc)
// Implicit conversions from RDDs to DataFrames
import sqlContext.implicits._
// Read a Parquet file into a DataFrame
val df = sqlContext.read.parquet("people.parquet")
// Displays the content of the DataFrame to stdout
df.show()
// Read a Parquet file into a DataFrame
val df = sqlContext.read.parquet("people.parquet")
// Display the content of the DataFrame to stdout
df.show()
// Print the schema
df.printSchema()
// Select only the "name" column
df.select("name").show()
// Select everybody's name and "age" + 1
df.select(df("name"), df("age") + 1).show()
// Select people older than 21
df.filter(df("age") > 21).show()
// Count people by age
df.groupBy("age").count().show()
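With sqlContext.implicits._ in scope, $"col" is shorthand for df("col"), so the column expressions above can also be written as follows (a small sketch, equivalent to the calls above):

// $-syntax for column references
df.select($"name", $"age" + 1).show()
df.filter($"age" > 21).show()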
// Read a Parquet file into a DataFrame
val df = sqlContext.read.parquet("people.parquet")
// Register the DataFrame as a temp table
df.registerTempTable("people")
// Display the content of the DataFrame to stdout
sqlContext.sql("SELECT * FROM people").show()
// Print the schema
sqlContext.sql("SELECT * FROM people").printSchema()
// Select only the "name" column
sqlContext.sql("SELECT name FROM people").show()
// Select everybody's name and "age" + 1
sqlContext.sql("SELECT name, (age + 1) AS age FROM people").show()
// Select people older than 21
sqlContext.sql("SELECT * FROM people WHERE age > 21").show()
// Count people by age
sqlContext.sql("SELECT age, count(age) AS count FROM people GROUP BY age").show()
// Use a case class to define the schema
case class Person(name: String, age: Int)

// Build a DataFrame from a text file and the case class
val df = sc.textFile("people.txt")
  .map(_.split(","))
  .map(p => Person(p(0), p(1).trim.toInt))
  .toDF()
df.registerTempTable("people")

val teenagers = sqlContext.sql("SELECT name, age FROM people WHERE age >= 13 AND age <= 19")
// The result of a SQL query is a DataFrame; normal RDD operations apply
teenagers.map(t => "Name: " + t(0)).collect().foreach(println)
// Columns of a Row can also be accessed by field name
//teenagers.map(t => "Name: " + t.getAs[String]("name")).collect().foreach(println)
// row.getValuesMap[T] retrieves several columns at once into a Map[String, T]
teenagers.map(_.getValuesMap[Any](List("name", "age"))).collect().foreach(println)
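When the columns are only known at runtime, the schema can also be built programmatically rather than with a case class; a minimal sketch of that alternative using the Spark 1.6 StructType API (not on the slides):

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

// Describe the columns explicitly instead of inferring them from a case class
val schema = StructType(Seq(
  StructField("name", StringType, nullable = true),
  StructField("age", IntegerType, nullable = true)))
val rowRDD = sc.textFile("people.txt").map(_.split(",")).map(p => Row(p(0), p(1).trim.toInt))
val people2 = sqlContext.createDataFrame(rowRDD, schema)
people2.registerTempTable("people2")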
// , "parquet" Sparkvaldf=sqlContext.read.load("users.parquet") // DataFrame "parquet"df.select("name","favorite_color").write.save("namesAndFavColors.parquet")
// , "parquet" Sparkvaldf=sqlContext.read.format("json").load("people.json")// DataFrame "parquet"df.select("name","age").write.save("namesAndAges.parquet")
import org.apache.spark.sql.SaveMode

// With no format specified, Spark defaults to "parquet"
val df = sqlContext.read.load("users.parquet")
// Save selected columns as "parquet", overwriting existing data (SaveMode.Overwrite)
df.select("name", "favorite_color").write.mode(SaveMode.Overwrite).save("namesAndFavColors.parquet")
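The other SaveMode values plug in the same way: ErrorIfExists (the default) fails if the target exists, Append adds to it, and Ignore silently skips the write. For example:

// Append to the existing output instead of replacing it
df.select("name", "favorite_color").write.mode(SaveMode.Append).save("namesAndFavColors.parquet")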
// Define the schema using a case class
case class Auction(auctionid: String, bid: Float, bidtime: Float, bidder: String,
  bidderrate: Integer, openbid: Float, price: Float, item: String, daystolive: Integer)

// Load the ebay auction data
val ebayText = sc.textFile("ebay.csv")
// Map each line to an Auction
val ebay = ebayText.map(_.split(",")).map(p => Auction(p(0), p(1).toFloat, p(2).toFloat, p(3),
  p(4).toInt, p(5).toFloat, p(6).toFloat, p(7), p(8).toInt))
// Convert to a DataFrame
val auction = ebay.toDF()
auction.registerTempTable("auction")

// How many distinct auctions are there?
val count = auction.select("auctionid").distinct.count
System.out.println(count)

// Number of bids per auction (item)
val results = sqlContext.sql("SELECT auctionid, item, count(bid) AS bid_count FROM auction GROUP BY auctionid, item")
results.show()

// Max / min / average price per auction
val results2 = sqlContext.sql("SELECT auctionid, MAX(price) AS price_max, MIN(price) AS price_min, AVG(price) AS price_avg FROM auction GROUP BY item, auctionid")
results2.show()
import com.databricks.spark.csv._

// Use the 3rd-party spark-csv library to load a CSV file into a DataFrame
val df = sqlContext.read.format("com.databricks.spark.csv")
  .option("header", "true")      // Use first line of all files as header
  .option("inferSchema", "true") // Automatically infer data types
  .load("sfpd.csv")

// Print the schema
df.printSchema

// Distinct values of Category
df.select("Category").distinct().collect().foreach(println)

// Register as a temp table
df.registerTempTable("sfpd")
// The same distinct query, through SQL
sqlContext.sql("SELECT distinct Category FROM sfpd").collect().foreach(println)

// Top 10 resolutions
sqlContext.sql("SELECT Resolution, count(Resolution) AS rescount FROM sfpd GROUP BY Resolution ORDER BY rescount DESC LIMIT 10").show()
// Top 10 categories
sqlContext.sql("SELECT Category, count(Category) AS catcount FROM sfpd GROUP BY Category ORDER BY catcount DESC LIMIT 10").show()
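spark-csv can write as well as read; a minimal sketch that saves a result back out as CSV (the output directory name here is hypothetical):

// Write the distinct categories back out as CSV
df.select("Category").distinct()
  .write.format("com.databricks.spark.csv")
  .option("header", "true")
  .save("sfpd_categories.csv")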