Logstash
- Added a local pipeline volume to the build to allow custom local configurations (see the sketch after this list)
- Updated the Sysmon config to fix https://github.com/Cyb3rWard0g/HELK/issues/63
- Removed the port exposed in the local Logstash Dockerfile; it will be pushed to the official Docker image in the next update
- Removed the Logstash init file (no longer used)
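A minimal sketch of how the new local pipeline mount could be used; the mounted path ./helk-logstash/pipeline and the container name come from the docker-compose change further down, while the config file name below is purely illustrative.

# Drop a custom config into the locally mounted pipeline folder (hypothetical file name)
cat > ./helk-logstash/pipeline/99-custom-output.conf <<'EOF'
output {
  stdout { codec => rubydebug }
}
EOF
# Restart the container so Logstash picks up the new pipeline file
docker restart helk-logstash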

Zeppelin
- Not available yet
- Initial draft Dockerfile
- Created a spark-defaults file for the future Zeppelin Dockerfile

Install Script
- Increased the minimum memory size required (see the pre-check sketch below)
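Since the minimum went up, a hedged sketch of checking a host before running the install script; the 12 GB threshold mirrors the install script change at the bottom of this commit, and the variable name is illustrative (the script itself checks free memory rather than total).

# Check total RAM (in GB) against the new 12 GB minimum before installing HELK
TOTAL_MEMORY_GB=$(free -m | awk 'NR==2{printf "%.f", $2/1024}')
if [ "${TOTAL_MEMORY_GB}" -ge 12 ]; then
  echo "[HELK-PRECHECK] ${TOTAL_MEMORY_GB} GB of memory available, minimum requirement met"
else
  echo "[HELK-PRECHECK] HELK now requires at least 12 GB of memory" >&2
fi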
keyword-vs-text-changes
Roberto Rodriguez 2018-05-31 02:08:15 -04:00
parent bb321d985a
commit f3a0e251ea
8 changed files with 180 additions and 259 deletions

View File

@@ -20,6 +20,8 @@ services:
helk-logstash:
image: cyb3rward0g/helk-logstash:6.2.4
container_name: helk-logstash
volumes:
- ./helk-logstash/pipeline:/usr/share/logstash/pipeline
environment:
- "LS_JAVA_OPTS=-Xms2g -Xmx2g"
restart: always
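To confirm the new bind mount is active, a quick check inside the container (container name and path taken from the compose entry above) might look like this:

# List the pipeline configs Logstash actually sees inside the container
docker exec helk-logstash ls -l /usr/share/logstash/pipeline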

View File

@@ -15,6 +15,4 @@ LABEL description="Dockerfile base for the HELK Logstash."
# *********** Adding HELK scripts, config files and pipeline configs to container ***************
ADD logstash.yml /usr/share/logstash/config/logstash.yml
ADD pipeline /usr/share/logstash/pipeline
ADD output_templates /usr/share/logstash/output_templates
EXPOSE 5044
ADD output_templates /usr/share/logstash/output_templates
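If the local image needs rebuilding after this Dockerfile change, a hedged sketch; the build context directory is inferred from the repo layout and the tag is an assumption.

# Rebuild the local Logstash image from the HELK repo root (illustrative tag)
docker build -t helk-logstash:local ./helk-logstash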

View File

@@ -1,202 +0,0 @@
#!/bin/sh
# Init script for logstash
# Maintained by Roberto Rodriguez @Cyb3rWard0g
# Reference:
# https://github.com/elastic/logstash/blob/master/distribution/rpm/src/main/packaging/init.d/logstash
# https://github.com/spujadas/elk-docker/blob/master/logstash-init
### BEGIN INIT INFO
# Provides: logstash
# Required-Start: $remote_fs $syslog
# Required-Stop: $remote_fs $syslog
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description:
# Description: Starts Logstash as a daemon.
### END INIT INFO
PATH=/sbin:/usr/sbin:/bin:/usr/bin
NAME=logstash
DEFAULT=/etc/default/$NAME
export PATH
if [ $(id -u) -ne 0 ]; then
echo "You need root privileges to run this script"
exit 1
fi
. /lib/lsb/init-functions
if [ -r /etc/default/rcS ]; then
. /etc/default/rcS
fi
# The following variables can be overwritten in $DEFAULT
JAVACMD=/usr/bin/java
LS_HOME=/usr/share/logstash
LS_SETTINGS_DIR=/etc/logstash
LS_CONF_PATH=/etc/logstash/pipeline
LS_LOGS_PATH=/var/log/logstash
LS_LOGS_FILE=${LS_LOGS_PATH}/${NAME}-plain.log
#LS_JAVA_OPTS=""
LS_PIDFILE=/var/run/logstash.pid
LS_USER=logstash
LS_GROUP=logstash
LS_GC_LOG_FILE=/var/log/logstash/gc.log
LS_OPEN_FILES=16384
LS_NICE=19
SERVICE_NAME="logstash"
SERVICE_DESCRIPTION="logstash"
# End of variables that can be overwritten in $DEFAULT
# overwrite settings from default file
if [ -f "$DEFAULT" ]; then
. "$DEFAULT"
fi
# Define other required variables
LS_EXEC=$LS_HOME/bin/logstash
#LS_EXEC_OPTS="--path.settings ${LS_SETTINGS_DIR} --path.config ${LS_CONF_PATH} --path.logs ${LS_LOGS_PATH}"
LS_EXEC_OPTS="--path.settings ${LS_SETTINGS_DIR}"
export LS_JAVA_OPTS
export LS_HOME
export LS_SETTINGS_DIR
if [ ! -x "$LS_EXEC" ]; then
echo "The logstash startup script does not exists or it is not executable, tried: $LS_EXEC"
exit 1
fi
touch ${LS_LOGS_FILE}
chown ${LS_USER}:${LS_GROUP} ${LS_LOGS_FILE}
checkJava() {
if [ -x "$JAVACMD" ]; then
JAVA="$JAVACMD"
else
JAVA=`which java`
fi
if [ ! -x "$JAVA" ]; then
echo "Could not find any executable java binary. Please install java in your PATH or set JAVACMD"
exit 1
fi
}
start() {
checkJava
echo "Starting $NAME"
if [ -n "$LS_PIDFILE" ] && [ ! -e "$LS_PIDFILE" ]; then
touch "$LS_PIDFILE" && chown logstash:logstash "$LS_PIDFILE"
fi
if [ -n "$LS_OPEN_FILES" ]; then
ulimit -n $LS_OPEN_FILES
fi
# Start Service
nice -n$LS_NICE chroot --userspec $LS_USER:$LS_GROUP / sh -c "
cd $LS_HOME
ulimit -n ${LS_OPEN_FILES}
exec $LS_EXEC $LS_EXEC_OPTS
" &
# Generate the pidfile from here. If we instead made the forked process
# generate it there will be a race condition between the pidfile writing
# and a process possibly asking for status.
echo $! > $LS_PIDFILE
echo "$NAME started."
return 0
}
stop() {
# Try a few times to kill TERM the program
if status; then
pid=$(cat "$LS_PIDFILE")
echo "Killing $NAME (pid $pid) with SIGTERM"
kill -TERM $pid
# Wait for it to exit.
for i in 1 2 3 4 5; do
echo "Waiting for $NAME (pid $pid) to die..."
status || break
sleep 1
done
if status; then
echo "$NAME stop failed; still running."
else
echo "$NAME stopped."
rm -f $LS_PIDFILE
fi
fi
}
status() {
if [ -f "$LS_PIDFILE" ] ; then
pid=$(cat "$LS_PIDFILE")
if kill -0 $pid > /dev/null 2> /dev/null; then
# process by this pid is running.
# It may not be our pid, but that's what you get with just pidfiles.
# TODO(sissel): Check if this process seems to be the same as the one we
# expect. It'd be nice to use flock here, but flock uses fork, not exec,
# so it makes it quite awkward to use in this case.
return 0
else
return 2 # program is dead but pid file exists
fi
else
return 3 # program is not running
fi
}
force_stop() {
if status; then
stop
status && kill -KILL $(cat "$LS_PIDFILE")
rm -f $LS_PIDFILE
fi
}
case "$1" in
start)
status
code=$?
if [ $code -eq 0 ]; then
echo "$NAME is already running"
else
start
code=$?
fi
exit $code
;;
stop) stop ;;
force-stop) force_stop ;;
status)
status
code=$?
if [ $code -eq 0 ]; then
echo "$NAME is running"
else
echo "$NAME is not running"
fi
exit $code
;;
restart) stop && start ;;
*)
echo "Usage: $SCRIPTNAME {start|stop|force-stop|status|restart}" >&2
exit 3
;;
esac
exit $?

View File

@@ -298,7 +298,8 @@ filter {
remove_field => "[event_data][UtcTime]"
remove_field => "[event_data][CreationUtcTime]"
remove_field => "[event_data][PreviousCreationUtcTime]"
rename => { "[event_data][User]" => "user"}
remove_field => "[user]"
rename => { "[event_data][User]" => "user_account"}
}
}
}
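The rename from user to user_account ties into this branch's keyword-vs-text theme; a hedged way to inspect the resulting field mapping. The index pattern and Elasticsearch address are assumptions, not part of this commit.

# Ask Elasticsearch how the renamed field is mapped (keyword vs. text)
curl -s 'http://localhost:9200/logs-endpoint-winevent-sysmon-*/_mapping/field/user_account?pretty'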

View File

@@ -10,86 +10,80 @@ LABEL description="Dockerfile base for the HELK Zeppelin."
ENV DEBIAN_FRONTEND noninteractive
USER root
# *********** Installing Prerequisites ***************
# `Z_VERSION` will be updated by `dev/change_zeppelin_version.sh`
ENV Z_VERSION="0.7.3"
ENV Z_VERSION="0.8."
ENV LOG_TAG="[ZEPPELIN_${Z_VERSION}]:" \
Z_HOME="/zeppelin" \
LANG=en_US.UTF-8 \
LC_ALL=en_US.UTF-8
Z_HOME="/zeppelin"
RUN echo "$LOG_TAG update and install basic packages" && \
apt-get -y update && \
apt-get install -y locales && \
locale-gen $LANG && \
apt-get install -y software-properties-common && \
apt -y autoclean && \
apt -y dist-upgrade && \
apt-get install -y build-essential
ENV Z_GID=710
ENV Z_UID=710
ENV Z_USER=zelk
ENV JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
RUN echo "$LOG_TAG Install java8" && \
apt-get -y update && \
apt-get install -y openjdk-8-jdk && \
apt-get install -y git openjdk-8-jdk \
libfontconfig git build-essential chrpath \
libssl-dev libxft-dev libfreetype6 libfreetype6-dev \
libfontconfig1 libfontconfig1-dev python3-pip && \
rm -rf /var/lib/apt/lists/*
# install conda first, before numpy and matplotlib, since pip and python will be installed by conda
RUN echo "$LOG_TAG Install miniconda2 related packages" && \
apt-get -y update && \
apt-get install -y bzip2 ca-certificates \
libglib2.0-0 libxext6 libsm6 libxrender1 \
git mercurial subversion && \
echo 'export PATH=/opt/conda/bin:$PATH' > /etc/profile.d/conda.sh && \
wget --quiet https://repo.continuum.io/miniconda/Miniconda2-4.3.11-Linux-x86_64.sh -O ~/miniconda.sh && \
/bin/bash ~/miniconda.sh -b -p /opt/conda && \
rm ~/miniconda.sh
ENV PATH /opt/conda/bin:$PATH
# *********** Upgrading PIP ***************
RUN pip3 install --upgrade pip
# *********** Create User *****************
RUN groupadd -r zelk -g ${Z_GID} \
&& useradd -m -s /bin/bash -N -u $Z_UID $Z_USER \
&& chmod g+w /etc/passwd /etc/group \
&& chown -R zelk:zelk /usr/local/ /tmp /usr/bin ${SPARK_HOME}
RUN echo "$LOG_TAG Install python related packages" && \
apt-get -y update && \
apt-get install -y python-dev python-pip && \
apt-get install -y gfortran && \
# numerical/algebra packages
# numerical/algebra packages
apt-get install -y libblas-dev libatlas-dev liblapack-dev && \
# font, image for matplotlib
apt-get install -y libpng-dev libfreetype6-dev libxft-dev && \
# for tkinter
apt-get install -y python-tk libxml2-dev libxslt-dev zlib1g-dev && \
pip install numpy && \
pip install matplotlib
pip3 install numpy && \
pip3 install matplotlib
RUN echo "$LOG_TAG Install R related packages" && \
echo "deb http://cran.rstudio.com/bin/linux/ubuntu xenial/" | tee -a /etc/apt/sources.list && \
gpg --keyserver keyserver.ubuntu.com --recv-key E084DAB9 && \
gpg -a --export E084DAB9 | apt-key add - && \
apt-get -y update && \
apt-get -y install r-base r-base-dev && \
R -e "install.packages('knitr', repos='http://cran.us.r-project.org')" && \
R -e "install.packages('ggplot2', repos='http://cran.us.r-project.org')" && \
R -e "install.packages('googleVis', repos='http://cran.us.r-project.org')" && \
R -e "install.packages('data.table', repos='http://cran.us.r-project.org')" && \
# for devtools, Rcpp
apt-get -y install libcurl4-gnutls-dev libssl-dev && \
R -e "install.packages('devtools', repos='http://cran.us.r-project.org')" && \
R -e "install.packages('Rcpp', repos='http://cran.us.r-project.org')" && \
Rscript -e "library('devtools'); library('Rcpp'); install_github('ramnathv/rCharts')"
# ************** Install PhantomJS ****************
USER $Z_UID
# ************** Install Maven *********************
ENV MAVEN_VERSION 3.5.3
RUN wget wget -qO- http://www.us.apache.org/dist/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz | sudo tar xvz -C /usr/local && \
RUN wget -qO- http://www.us.apache.org/dist/maven/maven-3/${MAVEN_VERSION}/binaries/apache-maven-${MAVEN_VERSION}-bin.tar.gz | tar xvz -C /usr/local && \
ln -s /usr/local/apache-maven-${MAVEN_VERSION}/bin/mvn /usr/bin/mvn
RUN echo "$LOG_TAG Download Zeppelin binary" && \
wget -O /tmp/zeppelin-${Z_VERSION}-bin-all.tgz http://archive.apache.org/dist/zeppelin/zeppelin-${Z_VERSION}/zeppelin-${Z_VERSION}-bin-all.tgz && \
tar -zxvf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
rm -rf /tmp/zeppelin-${Z_VERSION}-bin-all.tgz && \
mv /zeppelin-${Z_VERSION}-bin-all ${Z_HOME}
USER root
# ************** Install Zeppelin *********************
RUN echo "$LOG_TAG Download Zeppelin" && \
cd /tmp && git clone --branch v0.8.0-rc2 https://github.com/apache/zeppelin.git && \
mv /tmp/zeppelin ${Z_HOME}
RUN chown -R zelk:zelk ${Z_HOME}
USER $Z_UID
RUN cd $Z_HOME && \
mvn clean package -DskipTests -X
# *********** Install CAPS ***************
RUN cd ${Z_HOME} && \
wget https://github.com/opencypher/cypher-for-apache-spark/releases/download/1.0.0-beta7/spark-cypher-1.0.0-beta7-cluster.jar
ADD spark-defaults.conf ${SPARK_HOME}/conf/
USER root
RUN echo "$LOG_TAG Cleanup" && \
apt-get autoclean && \
apt-get clean
EXPOSE 8080
EXPOSE $ZEPPELIN_PORT
WORKDIR ${Z_HOME}
CMD ["bin/zeppelin.sh"]

View File

@@ -0,0 +1,38 @@
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# Default system properties included when running spark-submit.
# This is useful for setting default environmental settings.
# Example:
# spark.master spark://master:7077
# spark.eventLog.enabled true
# spark.eventLog.dir hdfs://namenode:8021/directory
# spark.serializer org.apache.spark.serializer.KryoSerializer
# spark.driver.memory 5g
# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three"
# HELK References:
# https://spark.apache.org/docs/latest/configuration.html
# https://graphframes.github.io/quick-start.html
# https://spark-packages.org/package/graphframes/graphframes
# https://spark.apache.org/docs/latest/sql-programming-guide.html#pyspark-usage-guide-for-pandas-with-apache-arrow
#spark.master spark://helk-spark-master:7077
spark.jars /zeppelin/spark-cypher-1.0.0-beta7-cluster.jar
spark.jars.packages graphframes:graphframes:0.5.0-spark2.1-s_2.11,org.apache.spark:spark-sql-kafka-0-10_2.11:2.3.0,databricks:spark-sklearn:0.2.3
spark.sql.execution.arrow.enabled true
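These settings only take effect if the file ends up in Spark's conf directory inside the image; a hedged check against the draft container, assuming the container name above and that SPARK_HOME is set in the image environment.

# Confirm the packaged spark-defaults.conf made it into $SPARK_HOME/conf
docker exec helk-zeppelin sh -c 'grep -E "spark\.(jars|sql)" "$SPARK_HOME/conf/spark-defaults.conf"'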

View File

@@ -0,0 +1,90 @@
#!/bin/bash
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# export JAVA_HOME=
# export MASTER= # Spark master url. eg. spark://master_addr:7077. Leave empty if you want to use local mode.
# export ZEPPELIN_JAVA_OPTS # Additional jvm options. for example, export ZEPPELIN_JAVA_OPTS="-Dspark.executor.memory=8g -Dspark.cores.max=16"
# export ZEPPELIN_MEM # Zeppelin jvm mem options Default -Xms1024m -Xmx1024m -XX:MaxPermSize=512m
# export ZEPPELIN_INTP_MEM # zeppelin interpreter process jvm mem options. Default -Xms1024m -Xmx1024m -XX:MaxPermSize=512m
# export ZEPPELIN_INTP_JAVA_OPTS # zeppelin interpreter process jvm options.
# export ZEPPELIN_SSL_PORT # ssl port (used when ssl environment variable is set to true)
# export ZEPPELIN_LOG_DIR # Where log files are stored. PWD by default.
# export ZEPPELIN_PID_DIR # The pid files are stored. ${ZEPPELIN_HOME}/run by default.
# export ZEPPELIN_WAR_TEMPDIR # The location of jetty temporary directory.
# export ZEPPELIN_NOTEBOOK_DIR # Where notebook saved
# export ZEPPELIN_NOTEBOOK_HOMESCREEN # Id of notebook to be displayed in homescreen. ex) 2A94M5J1Z
# export ZEPPELIN_NOTEBOOK_HOMESCREEN_HIDE # hide homescreen notebook from list when this value set to "true". default "false"
# export ZEPPELIN_NOTEBOOK_S3_BUCKET # Bucket where notebook saved
# export ZEPPELIN_NOTEBOOK_S3_ENDPOINT # Endpoint of the bucket
# export ZEPPELIN_NOTEBOOK_S3_USER # User in bucket where notebook saved. For example bucket/user/notebook/2A94M5J1Z/note.json
# export ZEPPELIN_NOTEBOOK_S3_KMS_KEY_ID # AWS KMS key ID
# export ZEPPELIN_NOTEBOOK_S3_KMS_KEY_REGION # AWS KMS key region
# export ZEPPELIN_IDENT_STRING # A string representing this instance of zeppelin. $USER by default.
# export ZEPPELIN_NICENESS # The scheduling priority for daemons. Defaults to 0.
# export ZEPPELIN_INTERPRETER_LOCALREPO # Local repository for interpreter's additional dependency loading
# export ZEPPELIN_INTERPRETER_DEP_MVNREPO # Remote principal repository for interpreter's additional dependency loading
# export ZEPPELIN_HELIUM_NPM_REGISTRY # Remote Npm registry for Helium dependency loader
# export ZEPPELIN_NOTEBOOK_STORAGE # Refers to pluggable notebook storage class, can have two classes simultaneously with a sync between them (e.g. local and remote).
# export ZEPPELIN_NOTEBOOK_ONE_WAY_SYNC # If there are multiple notebook storages, should we treat the first one as the only source of truth?
# export ZEPPELIN_NOTEBOOK_PUBLIC # Make notebook public by default when created, private otherwise
#### Spark interpreter configuration ####
## Use provided spark installation ##
## defining SPARK_HOME makes Zeppelin run spark interpreter process using spark-submit
##
# export SPARK_HOME # (required) When it is defined, load it instead of Zeppelin embedded Spark libraries
# export SPARK_SUBMIT_OPTIONS # (optional) extra options to pass to spark submit. eg) "--driver-memory 512M --executor-memory 1G".
# export SPARK_APP_NAME # (optional) The name of spark application.
## Use embedded spark binaries ##
## without SPARK_HOME defined, Zeppelin still able to run spark interpreter process using embedded spark binaries.
## however, it is not encouraged when you can define SPARK_HOME
##
# Options read in YARN client mode
# export HADOOP_CONF_DIR # yarn-site.xml is located in configuration directory in HADOOP_CONF_DIR.
# Pyspark (supported with Spark 1.2.1 and above)
# To configure pyspark, you need to set spark distribution's path to 'spark.home' property in Interpreter setting screen in Zeppelin GUI
# export PYSPARK_PYTHON # path to the python command. must be the same path on the driver(Zeppelin) and all workers.
# export PYTHONPATH
## Spark interpreter options ##
##
# export ZEPPELIN_SPARK_USEHIVECONTEXT # Use HiveContext instead of SQLContext if set true. true by default.
# export ZEPPELIN_SPARK_CONCURRENTSQL # Execute multiple SQL concurrently if set true. false by default.
# export ZEPPELIN_SPARK_IMPORTIMPLICIT # Import implicits, UDF collection, and sql if set true. true by default.
# export ZEPPELIN_SPARK_MAXRESULT # Max number of Spark SQL result to display. 1000 by default.
# export ZEPPELIN_WEBSOCKET_MAX_TEXT_MESSAGE_SIZE # Size in characters of the maximum text message to be received by websocket. Defaults to 1024000
#### HBase interpreter configuration ####
## To connect to HBase running on a cluster, either HBASE_HOME or HBASE_CONF_DIR must be set
# export HBASE_HOME= # (require) Under which HBase scripts and configuration should be
# export HBASE_CONF_DIR= # (optional) Alternatively, configuration directory can be set to point to the directory that has hbase-site.xml
#### ZeppelinHub connection configuration ####
# export ZEPPELINHUB_API_ADDRESS # Refers to the address of the ZeppelinHub service in use
# export ZEPPELINHUB_API_TOKEN # Refers to the Zeppelin instance token of the user
# export ZEPPELINHUB_USER_KEY # Optional, when using Zeppelin with authentication.
#### Zeppelin impersonation configuration
# export ZEPPELIN_IMPERSONATE_CMD # Optional, when user want to run interpreter as end web user. eg) 'sudo -H -u ${ZEPPELIN_IMPERSONATE_USER} bash -c '
# export ZEPPELIN_IMPERSONATE_SPARK_PROXY_USER #Optional, by default is true; can be set to false if you don't want to use --proxy-user option with Spark interpreter when impersonation enabled
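The template above ships with everything commented out; if Zeppelin is later pointed at the HELK Spark master, the handful of values needing uncommenting might look like this. The master URL mirrors the commented-out spark.master entry in spark-defaults.conf, and the SPARK_HOME path is purely an assumption.

# Hypothetical zeppelin-env.sh overrides for a HELK deployment
export SPARK_HOME=/opt/spark                      # assumption: wherever Spark is installed in the image
export MASTER=spark://helk-spark-master:7077      # mirrors the commented-out spark.master in spark-defaults.conf
export ZEPPELIN_MEM="-Xms1024m -Xmx1024m"         # defaults noted in the template above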

View File

@@ -28,8 +28,8 @@ check_min_requirements(){
AVAILABLE_MEMORY=$(free -hm | awk 'NR==2{printf "%.f\t\t", $4 }')
ES_MEMORY=$(free -hm | awk 'NR==2{printf "%.f", $4/2 }')
AVAILABLE_DISK=$(df -h | awk '$NF=="/"{printf "%.f\t\t", $4}')
if [ "${AVAILABLE_MEMORY}" -ge "10" ] && [ "${AVAILABLE_DISK}" -ge "30" ]; then
if [ "${AVAILABLE_MEMORY}" -ge "12" ] && [ "${AVAILABLE_DISK}" -ge "30" ]; then
echo "[HELK-INSTALLATION-INFO] Available Memory: $AVAILABLE_MEMORY"
echo "[HELK-INSTALLATION-INFO] Available Disk: $AVAILABLE_DISK"
else