Indico celebrates its 20th anniversary! Check our blog post for more information!

Hadoop and Spark User Forum

Europe/Zurich
31/S-028 (CERN)

31/S-028

CERN

30
Show room on map
    • 14:00 14:30
      Migration to Apache Hadoop3 30m

      This presentation will introduce plans for 2020 regarding migrating all the clusters managed by IT to Apache Hadoop 3

      Speaker: Emil Kleszcz (CERN)
    • 14:30 15:00
      Introduction to Presto - distributed storage agnostic SQL layer 30m

      This presentation will introduce Presto, a SQL abstraction layer on top of Big Data stores

      Speakers: Zbigniew Baranowski (CERN), Vasileios Dimakopoulos (Technical University of Crete (GR))

      -- Presto/Trino Hive-connector DDL: register an already-existing Parquet
      -- data set as an external table. Per the Hive connector docs, the
      -- partition columns listed in partitioned_by must be the last columns
      -- declared, in the same order ('year', 'month', 'day' below).
      CREATE TABLE hive_hadoop3.zbaranow_db.collectd (
        dstype                varchar,
        host                  varchar,
        interval              double,
        plugin                varchar,
        plugin_instance       varchar,
        time                  bigint,
        type                  varchar,
        type_instance         varchar,
        env                   varchar,
        region                varchar,
        dc                    varchar,
        value                 double,
        value_instance        varchar,
        _id                   varchar,
        availability_zone     varchar,
        event_timestamp       bigint,
        submitter_environment varchar,
        submitter_hostgroup   varchar,
        timestamp             bigint,
        toplevel_hostgroup    varchar,
        version               varchar,
        year                  int,
        month                 int,
        day                   int
      )
      WITH (
        format            = 'PARQUET',
        external_location = '/project/monitoring/collectd/database',
        partitioned_by    = ARRAY['year','month','day']
      );

      -- Make hive_hadoop3 the default catalog for the session.
      USE hive_hadoop3;

      -- Before discovery: query the hidden "$partitions" metadata table.
      -- The external table was just created, so no partitions are registered yet.
      SELECT * FROM hive_hadoop3.zbaranow_db."collectd$partitions";

      -- Hive-connector procedure: scan storage and register partitions found
      -- on disk. 'ADD' only adds partitions missing from the metastore
      -- (it never drops existing ones).
      CALL system.sync_partition_metadata('zbaranow_db', 'collectd', 'ADD');

      -- After discovery: the same query now lists the registered partitions,
      -- confirming sync_partition_metadata worked.
      SELECT * FROM hive_hadoop3.zbaranow_db."collectd$partitions";


      -- Sanity check: total number of rows visible through the external table.
      SELECT count(*) FROM hive_hadoop3.zbaranow_db.collectd;

      -- Row counts broken down by the 'month' partition column.
      SELECT month,
             count(*)
      FROM hive_hadoop3.zbaranow_db.collectd
      GROUP BY month;

      -- Monthly average of the 'hadoop' plugin's BytesWritten metric,
      -- converted from bytes to MiB (two divisions by 1024).
      SELECT month,
             avg(value) / 1024 / 1024
      FROM hive_hadoop3.zbaranow_db.collectd
      WHERE plugin = 'hadoop'
        AND type_instance = 'BytesWritten'
      GROUP BY month;

      -- CTAS through the Phoenix connector: materialise the 'hadoop' plugin
      -- rows from the Hive external table into an HBase/Phoenix table that is
      -- Snappy-compressed, pre-split into 20 salt buckets, and keyed by
      -- (type_instance, time, submitter_hostgroup, host).
      CREATE TABLE phoenix_hadoop3.default.collectd_hadoop
      WITH (
        rowkeys      = 'type_instance,time,submitter_hostgroup,host',
        compression  = 'SNAPPY',
        salt_buckets = 20
      )
      AS
      SELECT host,
             plugin,
             plugin_instance,
             time,
             value AS val,
             type_instance,
             submitter_hostgroup,
             submitter_environment
      FROM hive_hadoop3.zbaranow_db.collectd
      WHERE plugin = 'hadoop';