- # run this script with the user `hduser` or any user of your choice
- # jdk
- wget https://github.com/frekele/oracle-java/releases/download/8u212-b10/jdk-8u212-linux-x64.tar.gz
- tar zxvf jdk-8u212-linux-x64.tar.gz
- sudo mv jdk1.8.0_212 /usr/java
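- # optional sanity check: the jdk should report version 1.8.0_212
- /usr/java/bin/java -version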
- # ssh
- sudo apt-get install openssh-server # 'sudo dnf install openssh-server' for centos
- ssh-keygen -t rsa -P "" -f ~/.ssh/id_rsa # -f makes it non-interactive
- cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
- chmod 0600 ~/.ssh/authorized_keys
- # verification
- ssh localhost
- # if "connection closed" redo `ssh`
- exit
- # hadoop
- wget https://archive.apache.org/dist/hadoop/common/hadoop-3.3.0/hadoop-3.3.0.tar.gz
- tar xzf hadoop-3.3.0.tar.gz
- sudo mv hadoop-3.3.0 /opt/hadoop
- sudo chown -R "$(whoami)" /opt/hadoop
- # env variables
- cd ~ && nano .bashrc
- # paste and save
- export JAVA_HOME=/usr/java/
- export PATH=$PATH:$JAVA_HOME/bin
- export HADOOP_HOME=/opt/hadoop/
- export HADOOP_INSTALL=$HADOOP_HOME
- export HADOOP_MAPRED_HOME=$HADOOP_HOME
- export HADOOP_COMMON_HOME=$HADOOP_HOME
- export HADOOP_HDFS_HOME=$HADOOP_HOME
- export YARN_HOME=$HADOOP_HOME
- export PATH=$PATH:$HADOOP_HOME/sbin:$HADOOP_HOME/bin
- export HADOOP_COMMON_LIB_NATIVE_DIR=$HADOOP_HOME/lib/native
- export HADOOP_OPTS="-Djava.library.path=$HADOOP_INSTALL/lib/native"
- export HADOOP_CLASSPATH=$(hadoop classpath)
- # append to (rather than overwrite) the classpath set on the line above
- export HADOOP_CLASSPATH=$HADOOP_CLASSPATH:$JAVA_HOME/jre/lib:$JAVA_HOME/lib:$JAVA_HOME/lib/tools.jar
- source .bashrc
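- # optional sanity check: both commands should resolve now that PATH is updated
- java -version
- hadoop version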
- nano $HADOOP_HOME/etc/hadoop/hadoop-env.sh
- # add and save
- export JAVA_HOME=/usr/java/
- mkdir -p /opt/hadoop/hadoopdata/namenode # no sudo: your user already owns /opt/hadoop,
- mkdir -p /opt/hadoop/hadoopdata/datanode # and root-owned data dirs would break the format step
- # config files
- cd $HADOOP_HOME/etc/hadoop
- nano core-site.xml
- # paste and save
- <configuration>
- <property>
- <name>fs.defaultFS</name>
- <value>hdfs://localhost:9000</value>
- </property>
- </configuration>
- nano hdfs-site.xml
- # paste and save
- <configuration>
- <property>
- <name>dfs.replication</name>
- <value>1</value>
- </property>
- <property>
- <name>dfs.namenode.name.dir</name>
- <value>file:/opt/hadoop/hadoopdata/namenode</value>
- </property>
- <property>
- <name>dfs.datanode.data.dir</name>
- <value>file:/opt/hadoop/hadoopdata/datanode</value>
- </property>
- </configuration>
- nano mapred-site.xml
- # paste and save
- <configuration>
- <property>
- <name>mapreduce.framework.name</name>
- <value>yarn</value>
- </property>
- <property>
- <name>mapreduce.application.classpath</name>
- <value>$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/*:$HADOOP_MAPRED_HOME/share/hadoop/mapreduce/lib/*</value>
- </property>
- </configuration>
- nano yarn-site.xml
- # paste and save
- <configuration>
- <property>
- <name>yarn.nodemanager.aux-services</name>
- <value>mapreduce_shuffle</value>
- </property>
- <property>
- <name>yarn.nodemanager.env-whitelist</name>
- <value>JAVA_HOME,HADOOP_COMMON_HOME,HADOOP_HDFS_HOME,HADOOP_CONF_DIR,CLASSPATH_PREPEND_DISTCACHE,HADOOP_YARN_HOME,HADOOP_MAPRED_HOME</value>
- </property>
- </configuration>
- # format the hdfs namenode
- hdfs namenode -format
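- # the output should end with a line like "Storage directory ... has been successfully formatted."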
- # start
- start-all.sh
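- # optional: confirm the daemons are up (NameNode, DataNode, SecondaryNameNode, ResourceManager, NodeManager)
- jps
- # web UIs (hadoop 3.x defaults): namenode at http://localhost:9870, yarn at http://localhost:8088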
- ## Python word-count lab
- sudo apt-get install python2 # 'sudo dnf install python2' for centos
- cd ~
- # put words in `wcount.txt`
- nano wcount.txt
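- # any repeated words will do; e.g., a quick way to fill the file:
- printf 'hello world\nhello hadoop\n' > wcount.txt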
- hdfs dfs -put wcount.txt /wcount.txt
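- # optional: confirm the upload
- hdfs dfs -ls /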
- # code
- # https://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
- mkdir WordCountPython
- # copy-paste the mapper and reducer code from the tutorial above (or use the sketch below)
- nano WordCountPython/WordCount_reducer.py
- nano WordCountPython/WordCount_mapper.py
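- # if the page above is unreachable, here is a minimal python2 sketch of the two
- # scripts (close in spirit to the linked tutorial, not a verbatim copy); you can
- # create them with heredocs instead of nano:
- cat > WordCountPython/WordCount_mapper.py << 'EOF'
- #!/usr/bin/env python2
- import sys
- # emit "word<TAB>1" for every word read on stdin
- for line in sys.stdin:
-     for word in line.strip().split():
-         print '%s\t%s' % (word, 1)
- EOF
- cat > WordCountPython/WordCount_reducer.py << 'EOF'
- #!/usr/bin/env python2
- import sys
- # stdin arrives sorted by key, so identical words are adjacent
- current_word, current_count = None, 0
- for line in sys.stdin:
-     word, count = line.strip().split('\t', 1)
-     if word == current_word:
-         current_count += int(count)
-     else:
-         if current_word is not None:
-             print '%s\t%s' % (current_word, current_count)
-         current_word, current_count = word, int(count)
- if current_word is not None:
-     print '%s\t%s' % (current_word, current_count)
- EOF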
- # test w/o hadoop
- cat wcount.txt | python2 WordCountPython/WordCount_mapper.py | sort -k1,1 | python2 WordCountPython/WordCount_reducer.py
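- # with the sample wcount.txt above, the local pipeline should print something like:
- #   hadoop	1
- #   hello	2
- #   world	1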
- # hadoop
- hdfs dfsadmin -safemode leave # make sure the namenode is out of safe mode before the job writes output
- # execute (copy the whole command)
- hadoop jar /opt/hadoop/share/hadoop/tools/lib/hadoop-streaming-3.3.0.jar \
-     -file WordCountPython/WordCount_mapper.py -file WordCountPython/WordCount_reducer.py \
-     -mapper "python2 WordCount_mapper.py" -reducer "python2 WordCount_reducer.py" \
-     -input /wcount.txt -output /wcountpython
- # result
- hdfs dfs -cat /wcountpython/part-00000