SparkSQL Built-in Functions -- countDistinct

Date: 2020-09-14

This article introduces the SparkSQL built-in function countDistinct, which counts the number of distinct values of a column within each group, with a runnable spark-shell example and points to note. First, start HDFS, YARN, the Hive metastore, and the Spark standalone master and workers:
[root@centos00 ~]$ cd hadoop-2.6.0-cdh5.14.2/
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start namenode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/hadoop-daemon.sh start datanode
[root@centos00 hadoop-2.6.0-cdh5.14.2]$ sbin/yarn-daemon.sh start resourcemanager
  
[root@centos00 ~]$ cd /opt/cdh5.14.2/hive-1.1.0-cdh5.14.2/
[root@centos00 hive-1.1.0-cdh5.14.2]$ bin/hive --service metastore &
  
[root@centos00 ~]$ cd /opt/cdh5.14.2/spark-2.2.1-cdh5.14.2/
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-master.sh
[root@centos00 spark-2.2.1-cdh5.14.2]$ sbin/start-slaves.sh

With the cluster up, the following examples run in spark-shell:

scala> import org.apache.spark.sql.functions._
import org.apache.spark.sql.functions._

scala> val arr = Array(("a", "20"), ("a", "30"), ("b", "20"), ("a", "20"))
arr: Array[(String, String)] = Array((a,20), (a,30), (b,20), (a,20))

scala> val df = sc.parallelize(arr).toDF("id", "age")
df: org.apache.spark.sql.DataFrame = [id: string, age: string]

scala> df.show(false)
+---+---+
|id |age|
+---+---+
|a  |20 |
|a  |30 |
|b  |20 |
|a  |20 |
+---+---+


Group by id and count the distinct age values in each group, using Scala Symbol column syntax:

scala> df.groupBy('id).agg(countDistinct('age) as 'distinctAge).show(false)
+---+-----------+
|id |distinctAge|
+---+-----------+
|b  |1          |
|a  |2          |
+---+-----------+
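To make the semantics concrete, here is a minimal plain-Scala sketch (no Spark required) of what countDistinct('age) computes for each id group: the number of distinct age values among that group's rows. The variable names are illustrative, not part of any Spark API.

```scala
// Plain-Scala sketch of the per-group distinct count, using the same data as above.
val arr = Array(("a", "20"), ("a", "30"), ("b", "20"), ("a", "20"))

// Group rows by id, then count the distinct age values within each group.
val distinctAge: Map[String, Int] =
  arr.groupBy(_._1)
     .map { case (id, rows) => id -> rows.map(_._2).distinct.size }

println(distinctAge("a"))  // 2: ages 20 and 30 (the duplicate 20 is collapsed)
println(distinctAge("b"))  // 1: only age 20
```

This matches the Spark output above: the duplicate ("a", "20") row contributes nothing to the distinct count.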


The same aggregation, using string column names instead of Scala Symbols:

scala> df.groupBy("id").agg(countDistinct("age") as "distinctAge").show(false)
+---+-----------+                                                               
|id |distinctAge|
+---+-----------+
|b  |1          |
|a  |2          |
+---+-----------+
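The difference between count and countDistinct is exactly the duplicate ("a", "20") row: in Spark, df.groupBy('id).agg(count('age)) would report 3 for id "a", while countDistinct reports 2. A minimal plain-Scala sketch (no Spark) of the two counts side by side, with illustrative names:

```scala
// Plain-Scala sketch: per-group (count, countDistinct) on the same data.
val rows = Array(("a", "20"), ("a", "30"), ("b", "20"), ("a", "20"))

val counts: Map[String, (Int, Int)] =
  rows.groupBy(_._1).map { case (id, rs) =>
    id -> (rs.size, rs.map(_._2).distinct.size)  // (count, countDistinct)
  }

println(counts("a"))  // (3,2): three age rows, but only two distinct values
println(counts("b"))  // (1,1)
```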

  

Original article: https://www.cnblogs.com/ji-hf/p/13665911.html