summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorSuren A. Chilingaryan <csa@suren.me>2018-07-05 06:29:09 +0200
committerSuren A. Chilingaryan <csa@suren.me>2018-07-05 06:29:09 +0200
commit2c3f1522274c09f7cfdb6309adc0719f05c188e9 (patch)
treee54e0c26f581543f48e945f186734e4bd9a8f15a
parent8af0865a3a3ef783b36016c17598adc9d932981d (diff)
downloadands-2c3f1522274c09f7cfdb6309adc0719f05c188e9.tar.gz
ands-2c3f1522274c09f7cfdb6309adc0719f05c188e9.tar.bz2
ands-2c3f1522274c09f7cfdb6309adc0719f05c188e9.tar.xz
ands-2c3f1522274c09f7cfdb6309adc0719f05c188e9.zip
Update monitoring scripts to track leftover OpenVSwitch 'veth' interfaces and clean them up periodically to avoid performance degradation, split kickstart
-rw-r--r--docs/consistency.txt12
-rw-r--r--docs/kickstart.txt12
-rw-r--r--docs/logs.txt36
-rw-r--r--docs/problems.txt103
-rw-r--r--docs/projects/katrindb.txt255
-rw-r--r--docs/troubleshooting.txt18
-rw-r--r--kickstart/README9
-rwxr-xr-xkickstart/autocd/build.sh17
-rw-r--r--kickstart/autocd/isolinux/isolinux.cfg76
-rwxr-xr-xkickstart/ipmi.sh157
-rw-r--r--kickstart/kickstart/authorized_keys4
-rw-r--r--kickstart/kickstart/authorized_keys.csa4
-rw-r--r--kickstart/kickstart/index.php182
-rw-r--r--kickstart/kickstart/ipekatrin-v4.ks100
-rwxr-xr-xkickstart/testvm/centos.sh9
-rw-r--r--opts.sh1
-rw-r--r--playbooks/openshift-add-project.yml31
-rw-r--r--roles/ands_kaas/tasks/do_project.yml5
-rw-r--r--roles/ands_kaas/tasks/project.yml2
-rw-r--r--roles/ands_monitor/tasks/main.yml13
-rw-r--r--roles/ands_monitor/templates/cron/maintain.j24
-rwxr-xr-xroles/ands_monitor/templates/scripts/check_server_status.sh.j211
-rwxr-xr-xroles/ands_monitor/templates/scripts/check_uptime_status.sh.j29
-rwxr-xr-xroles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j218
-rwxr-xr-xroles/ands_monitor/templates/scripts/list_containers.sh.j23
-rwxr-xr-xroles/ands_monitor/templates/scripts/maintain.sh.j28
-rwxr-xr-xsetup.sh6
-rw-r--r--setup/configs/openshift.yml5
-rw-r--r--setup/configs/security.yml1
-rw-r--r--setup/users/htpasswd5
30 files changed, 549 insertions, 567 deletions
diff --git a/docs/consistency.txt b/docs/consistency.txt
index caaaf36..dcf311a 100644
--- a/docs/consistency.txt
+++ b/docs/consistency.txt
@@ -39,7 +39,17 @@ Networking
- Ensure, we don't have override of cluster_name to first master (which we do during the
provisioning of OpenShift plays)
-
+
+ - Sometimes OpenShift fails to clean-up after terminated pod properly. This causes rogue
+ network interfaces to remain in OpenVSwitch fabric. This can be determined by errors like:
+ could not open network device vethb9de241f (No such device)
+ reported by 'ovs-vsctl show' or present in the log '/var/log/openvswitch/ovs-vswitchd.log'
+ which may quickly grow over 100MB. If the number of rogue interfaces grows too much,
+ pod scheduling will start to time out on the affected node.
+ * The work-around is to delete rogue interfaces with
+ ovs-vsctl del-port br0 <iface>
+ This does not solve the problem, however. The new interfaces will get abandoned by OpenShift.
+
ADEI
====
diff --git a/docs/kickstart.txt b/docs/kickstart.txt
index 1331542..b94b0f6 100644
--- a/docs/kickstart.txt
+++ b/docs/kickstart.txt
@@ -11,4 +11,14 @@ Troubleshooting
dmsetup remove_all
dmsetup remove <name>
- \ No newline at end of file
+ - Sometimes even this does not help.
+ > On CentOS 7.4 mdadm does not recognize the disk, but LVM thinks it is
+ part of MD. Then cleaning last megabytes of the former md partition may help.
+ > On Fedora 28, mdadm detects the old array and tries to "tear it down", but
+ fails as the raid array is already inactive.
+
+ * If the raid is still more-or-less healthy, it can be destroyed with
+ mdadm --zero-superblock /dev/sdb3
+ * Otherwise:
+ dd if=/dev/zero of=/dev/sda4 bs=512 seek=$(( $(blockdev --getsz /dev/sda4) - 1024 )) count=1024
+
diff --git a/docs/logs.txt b/docs/logs.txt
new file mode 100644
index 0000000..e27b1ff
--- /dev/null
+++ b/docs/logs.txt
@@ -0,0 +1,36 @@
+/var/log/messages
+=================
+ - Various RPC errors.
+ ... rpc error: code = # desc = xxx ...
+
+ - container kill failed because of 'container not found' or 'no such process': Cannot kill container ###: rpc error: code = 2 desc = no such process"
+ Despite the error, the containers are actually killed and pods destroyed. However, this error likely triggers
+ problem with rogue interfaces staying on the OpenVSwitch bridge.
+
+ - containerd: unable to save f7c3e6c02cdbb951670bc7ff925ddd7efd75a3bb5ed60669d4b182e5337dec23:d5b9394468235f7c9caca8ad4d97e7064cc49cd59cadd155eceae84545dc472a starttime: read /proc/81994/stat: no such process
+ containerd: f7c3e6c02cdbb951670bc7ff925ddd7efd75a3bb5ed60669d4b182e5337dec23:d5b9394468235f7c9caca8ad4d97e7064cc49cd59cadd155eceae84545dc472a (pid 81994) has become an orphan, killing it
+ Seems a bug in docker 1.12* which is resolved in 1.13.0rc2. No side effects according to the issue.
+ https://github.com/moby/moby/issues/28336
+
+ - W0625 03:49:34.231471 36511 docker_sandbox.go:337] failed to read pod IP from plugin/docker: NetworkPlugin cni failed on the status hook for pod "...": Unexpected command output nsenter: cannot open /proc/63586/ns/net: No such file or directory
+ - W0630 21:40:20.978177 5552 docker_sandbox.go:337] failed to read pod IP from plugin/docker: NetworkPlugin cni failed on the status hook for pod "...": CNI failed to retrieve network namespace path: Cannot find network namespace for the terminated container "..."
+ Probably covered by the following bug report and accordingly can be ignored...
+ https://bugzilla.redhat.com/show_bug.cgi?id=1434950
+
+ - E0630 14:05:40.304042 5552 glusterfs.go:148] glusterfs: failed to get endpoints adei-cfg[an empty namespace may not be set when a resource name is provided]
+ E0630 14:05:40.304062 5552 reconciler.go:367] Could not construct volume information: MountVolume.NewMounter failed for volume "kubernetes.io/glusterfs/4
+ I guess some configuration issue.... Probably can be ignored...
+
+ - kernel: SELinux: mount invalid. Same superblock, different security settings for (dev mqueue, type mqueue)
+ There are no adverse effects to this. It is a potential kernel issue, but should be just ignored by the customer. Nothing is going to break.
+ https://bugzilla.redhat.com/show_bug.cgi?id=1425278
+
+
+ - E0625 03:59:52.438970 23953 watcher.go:210] watch chan error: etcdserver: mvcc: required revision has been compacted
+ seems fine and can be ignored.
+
+
+/var/log/openvswitch/ovs-vswitchd.log
+=====================================
+ - bridge|WARN|could not open network device veth7d33a20f (No such device)
+ Indicates a pod-cleanup failure and may cause problems during pod-scheduling.
diff --git a/docs/problems.txt b/docs/problems.txt
new file mode 100644
index 0000000..4be9dc7
--- /dev/null
+++ b/docs/problems.txt
@@ -0,0 +1,103 @@
+Actions Required
+================
+ * Long-term solution to 'rogue' interfaces is unclear. May require update to OpenShift 3.9 or later.
+ However, proposed work-around should do unless execution rate grows significantly.
+ * All other problems found in logs can be ignored.
+
+
+Rogue network interfaces on OpenVSwitch bridge
+==============================================
+ Sometimes OpenShift fails to clean-up after terminated pod properly. The actual reason is unclear.
+ * The issue is discussed here:
+ https://bugzilla.redhat.com/show_bug.cgi?id=1518684
+ * And can be determined by looking into:
+ ovs-vsctl show
+
+ Problems:
+ * As the number of rogue interfaces grows, it starts to have an impact on performance. Operations with
+ ovs slow down and at some point the pods scheduled to the affected node fail to start due to
+ timeouts. This is indicated in 'oc describe' as: 'failed to create pod sandbox'
+
+ Cause:
+ * Unclear, but it seems periodic ADEI cron jobs cause the issue.
+ * Could be related to the 'container kill failed' problem explained in the section below.
+ Cannot kill container ###: rpc error: code = 2 desc = no such process
+
+
+ Solutions:
+ * According to RedHat the temporary solution is to reboot the affected node (not tested yet). The problem
+ should go away, but may re-appear after a while.
+ * The simplest work-around is to just remove the rogue interfaces. They will be re-created, but performance
+ problems only start after hundreds accumulate.
+ ovs-vsctl del-port br0 <iface>
+
+ Status:
+ * A cron job is installed which cleans up rogue interfaces once their number hits 25.
+
+
+Orphaning / pod termination problems in the logs
+================================================
+ There are several classes of problems reported in the system log with unknown repercussions. Currently, I
+ don't see any negative side effects, except that some of these issues may trigger the "rogue interfaces" problem.
+
+ ! container kill failed because of 'container not found' or 'no such process': Cannot kill container ###: rpc error: code = 2 desc = no such process"
+
+ Despite the error, the containers are actually killed and pods destroyed. However, this error likely triggers
+ problem with rogue interfaces staying on the OpenVSwitch bridge.
+
+ Scenario:
+ * happens with short-living containers
+
+ - containerd: unable to save f7c3e6c02cdbb951670bc7ff925ddd7efd75a3bb5ed60669d4b182e5337dec23:d5b9394468235f7c9caca8ad4d97e7064cc49cd59cadd155eceae84545dc472a starttime: read /proc/81994/stat: no such process
+ containerd: f7c3e6c02cdbb951670bc7ff925ddd7efd75a3bb5ed60669d4b182e5337dec23:d5b9394468235f7c9caca8ad4d97e7064cc49cd59cadd155eceae84545dc472a (pid 81994) has become an orphan, killing it
+
+ Scenario:
+ This happens every couple of minutes and is attributed to perfectly alive and running pods.
+ * For instance, ipekatrin1 was complaining about some ADEI pod.
+ * After I removed this pod, it immediately started complaining about the 'glusterfs' replica.
+ * If the 'glusterfs' pod is re-created, the problem persists.
+ * It seems only a single pod is affected at each given moment (at least this was always true
+ on ipekatrin1 & ipekatrin2 while I was researching the problem)
+
+ Relations:
+ * This problem is not aligned with the 'container not found' problem described above. That one happens with short-living containers which
+ actually get destroyed. This one is triggered for persistent containers which keep running. And in fact this problem is triggered
+ significantly more frequently.
+
+ Cause:
+ * Seems related to docker health checks due to a bug in docker 1.12* which is resolved in 1.13.0rc2
+ https://github.com/moby/moby/issues/28336
+
+ Problems:
+ * It seems the only effect is excessive logging, according to the discussion in the issue
+
+ Solution: Ignore for now
+ * docker-1.13 had some problems with groups (I don't remember exactly) and it was decided to not run it with current version of KaaS.
+ * Only update docker after extensive testing on the development cluster or not at all.
+
+ - W0625 03:49:34.231471 36511 docker_sandbox.go:337] failed to read pod IP from plugin/docker: NetworkPlugin cni failed on the status hook for pod "...": Unexpected command output nsenter: cannot open /proc/63586/ns/net: No such file or directory
+ - W0630 21:40:20.978177 5552 docker_sandbox.go:337] failed to read pod IP from plugin/docker: NetworkPlugin cni failed on the status hook for pod "...": CNI failed to retrieve network namespace path: Cannot find network namespace for the terminated container "..."
+ Scenario:
+ * It seems can be ignored, see RH bug.
+ * Happens with short-living containers (adei cron jobs)
+
+ Relations:
+ * This is also not aligned with 'container not found'. The times in the logs differ significantly.
+ * It is also not aligned with 'orphan' problem.
+
+ Cause:
+ ? https://bugzilla.redhat.com/show_bug.cgi?id=1434950
+
+ - E0630 14:05:40.304042 5552 glusterfs.go:148] glusterfs: failed to get endpoints adei-cfg[an empty namespace may not be set when a resource name is provided]
+ E0630 14:05:40.304062 5552 reconciler.go:367] Could not construct volume information: MountVolume.NewMounter failed for volume "kubernetes.io/glusterfs/4
+
+ I guess some configuration issue.... Probably can be ignored...
+
+ Scenario:
+ * Reported on long running pods with persistent volumes (katrin, adai-db)
+ * Also seems an unrelated set of the problems.
+
+
+
+
+
diff --git a/docs/projects/katrindb.txt b/docs/projects/katrindb.txt
new file mode 100644
index 0000000..0a14a25
--- /dev/null
+++ b/docs/projects/katrindb.txt
@@ -0,0 +1,255 @@
+# Steps to setup KDB infrastructure in OpenShift
+
+Web interface: https://kaas.kit.edu:8443/console/
+
+Commandline interface:
+```
+oc login kaas.kit.edu:8443
+oc project katrin
+```
+
+
+## Overview
+
+The setup uses (at least) three containers:
+* `kdb-backend` is a MySQL/MariaDB container that provides the database backend
+ used by KDB server. It hosts the `katrin` and `katrin_run` databases.
+* `kdb-server` runs the KDB server process inside an Apache environment. It
+ provides the web interface (`kdb-admin.fcgi`) and the KaLi service
+ (`kdb-kali.fcgi`).
+* `run-processing` periodically retrieves run files from several DAQ machines
+ and adds the processed files to the KDB runlist. This process could be
+ distributed over several containers for the individual systems (`fpd` etc.)
+
+> The ADEI server hosting the `adei` MySQL database runs in an independent project with hostname `mysql.adei.svc`.
+
+A persistent storage volume is needed for the MySQL data (volume group `db`)
+and for the copied/processed run files (volume group `katrin`). The latter one
+is shared between the KDB server and run processing applications.
+
+
+## MySQL backend
+
+### Application
+
+This container is based on the official Redhat MariaDB Docker image. The
+OpenShift application is created via the CLI:
+```
+oc new-app -e MYSQL_ROOT_PASSWORD=XXX --name=kdb-backend registry.access.redhat.com/rhscl/mariadb-101-rhel7
+```
+Because KDB uses two databases (`katrin`, `katrin_run`) and must be permitted
+to create/edit database users, it is required to define a root password here.
+
+### Volumes
+
+This container needs a persistent storage volume for the database content. In
+OpenShift this is done by removing the default storage and adding a persistent
+volume `kdb-backend` for MySQL data: `db: /kdb/mysql/data -> /var/lib/mysql/data`
+
+### Final steps
+
+It makes sense to add readiness/liveness probes as well: TCP socket, port 3306.
+
+> It is possible to access the MySQL server inside a container: `mysql -h kdb-backend.katrin.svc -u root -p -A`
+
+
+## KDB server
+
+### Application
+
+The container is created from a `Dockerfile` available in GitLab:
+https://nuserv.uni-muenster.de:8443/katrin-git/Dockerfiles/tree/kdbserver
+
+The app is created via the CLI, but manual changes are necessary later on:
+```
+oc new-app https://nuserv.uni-muenster.de:8443/katrin-git/Dockerfiles.git --name=kdb-server
+```
+
+> The build fails because the branch name and user credentials are not defined.
+
+The build settings must be adapted before the image can be created.
+* Set the git branch name to `kdbserver`.
+* Add a source secret `katrin-gitlab` that provides the git user credentials,
+ i.e. the `katrin` username and corresponding password for read-only access.
+
+When a container instance (pod) is created in OpenShift, the main script
+`/run-httpd.sh` starts the Apache webserver with the KDB fastcgi module.
+
+### Volumes
+
+Just like the MySQL backend, the container needs persistent storage enabled: `katrin: /data -> /mnt/katrin/data`
+
+### Config Maps
+
+Some default configuration files for the Apache web server and the KDB server
+installation are provided with the Dockerfile. The webserver config should
+work correctly as it is. The main config must be updated so that the correct
+servers/databases are used. A config map `kdbserver-config` is created with
+mountpoint `/config` in the container:
+* `kdbserver.conf` is the main config for the KDB server instance. For the
+ steps outlined here, it should contain the following entries:
+
+```
+sql_server = kdb-backend.katrin.svc
+sql_adei_server = mysql.adei.svc
+
+sql_katrin_dbname = katrin
+sql_run_dbname = katrin_run
+sql_adei_dbname = adei_katrin
+
+sql_user = root
+sql_password = XXX
+sql_adei_user = katrin
+sql_adei_password = XXX
+
+use_adei_cache = true
+adei_service_url = http://adei-katrin.kaas.kit.edu/adei
+adei_public_url = http://katrin.kit.edu/adei-katrin
+```
+* `log4cxx.properties` defines the terminal/logfile output settings. By default,
+ all log output is shown on `stdout` (and visible in the OpenShift log).
+
+> Files in `/config` are symlinked to the respective files inside the container by `/run-httpd.sh`.
+
+### Database setup
+
+The KDB server sources provide a SQL dump file to initialize the database. To
+create an empty database with all necessary tables, run the `mysql` command:
+```
+mysql -h kdb-backend.katrin.svc -u root -p < /src/kdbserver/Data/katrin-db.sql
+```
+
+Alternatively, a full backup of the existing database can be imported:
+```
+tar -xJf /src/kdbserver/Data/katrin-db-bkp.sql.xz -C /tmp
+mysql -h kdb-backend.katrin.svc -u root -p < /tmp/katrin-db-bkp.sql
+```
+
+> To clean a database table, execute a MySQL `drop table` statement and re-initialize the dropped tables from the `katrin-db.sql` file.
+
+### IDLE storage
+
+IDLE provides a local storage on the server-side file system. An empty IDLE
+repository with default datasets is created by executing this command:
+```
+/opt/kasper/bin/idle SetupPublicDatasets
+```
+
+This creates a directory `.../storage/idle/KatrinIdle` on the storage volume
+that can be filled with contents from a backup archive. The `oc rsync` command
+can be used to transfer files to a running container (pod) in OpenShift.
+
+> After restoring one should fix all permissions so that KDB can access the data.
+
+
+
+### Final steps
+
+Again a readiness/liveness probe can be added: TCP socket, port 80.
+
+To make the KDB server interface accessible to the outside, a route must be
+added in OpenShift: `http://kdb.kaas.kit.edu -> kdb-server:80`
+
+> The web interface is now available at http://kdb.kaas.kit.edu/kdb-admin.fcgi
+
+
+## Run processing
+
+### Application
+
+The setup for the run processing service is similar to the KDB server, with
+the container being created from a GitLab `Dockerfile` as well:
+https://nuserv.uni-muenster.de:8443/katrin-git/Dockerfiles/tree/inlineprocessing
+The app is created via the CLI, but manual changes are necessary later on:
+```
+oc new-app https://nuserv.uni-muenster.de:8443/katrin-git/Dockerfiles.git --name=run-processing
+```
+
+> The build fails because the branch name and user credentials are not defined.
+
+The build settings must be adapted before the image can be created.
+* Set the git branch name to `inlineprocessing`.
+* Use the source secret `katrin-gitlab` that was created before.
+
+#### Run environment
+
+When a container instance (pod) is created in OpenShift, the main script
+`/run-loop.sh` starts the main processing script `process-system.py`. It
+is executed in a continuous loop with a user-defined delay. The script
+is configured by the following environment variables that can be defined
+in the OpenShift configuration:
+* `PROCESS_SYSTEMS` defines one or more DAQ systems configured in the file
+ `ProcessingConfig.py`: `fpd`, `mos`, etc.
+* `PROCESS_FLAGS` defines additional options passed to the script, e.g.
+ `--pull` to automatically retrieve run files from configured DAQ machines.
+* `REFRESH_INTERVAL` defines the waiting time between consecutive executions.
+ Note that the `/run-loop.sh` script waits until `process-system.py` has finished
+ before the next loop iteration is started, so the delay time is always
+ included regardless of how long the script takes to process all files.
+
+### Volumes
+
+The run processing stores files that need to be accessible by the KDB server
+application. Hence, the same persistent volume is used in this container:
+`katrin: /data -> /mnt/katrin/data`
+
+To ensure that all processes can read/write correctly, the file permissions are
+relaxed (this can be done in an OpenShift terminal or remote shell):
+```
+mkdir -p /mnt/katrin/data/{inbox,archive,storage,workspace,logs,tmp}
+chown -R katrin: /mnt/katrin/data
+chmod -R ug+rw /mnt/katrin/data
+```
+
+### Config Maps
+
+Just like with the KDB server, a config map `run-processing-config` with
+mountpoint `/config` should be added, which defines the configuration of the
+processing script:
+* `ProcessingConfig.py` is the main config where the DAQ machines are defined
+ with their respective storage paths. The file also defines a list of
+ processing steps to be executed for each run file; these steps may have
+ to be adapted where necessary.
+* `datamanager.cfg` defines the interface to the KaLi web service. It must be
+ configured so that the KDB server instance from above is used:
+
+```
+url = http://kdb-server.katrin.svc/kdb-kali.fcgi
+user = katrin
+password = XXX
+timeout_seconds = 300
+cache_age_hours = -1
+```
+* `rsync-filter` is applied with the `rsync` command that copies run files
+ from the DAQ machines. It can be adapted to exclude certain directories,
+ e.g. old run files that do not need to be processed.
+* `log4cxx.properties` configures terminal/logfile output, see above.
+
+> Files in `/config` are symlinked to the respective files inside the container by `/run-loop.sh`.
+
+#### SSH keys
+
+A second config map `run-processing-ssh` is required to provide SSH keys that
+are used to authenticate remote connections to the DAQ machines. The map with
+mountpoint `/.ssh` should contain the files `id_dsa`, `id_dsa.pub` and
+`known_hosts` and must be adapted as necessary.
+
+> This assumes that the SSH credentials have been added to the respective machines beforehand!
+
+> The contents of `known_hosts` should be updated with the output of `ssh-keyscan` for the configured DAQ machines.
+
+### Notes
+
+The script `/run-loop.sh` pulls files from the DAQ machines and processes
+them automatically, newest first. Where necessary, run files can be copied
+manually (FPD example; adapt the options and `rsync-filter` file as required):
+```
+rsync -rltD --verbose --append-verify --partial --stats --compare-dest=/mnt/katrin/data/archive/FPDComm_530 --filter='. /opt/processing/system/rsync-filter' --log-file='/mnt/katrin/data/logs/rsync_fpd.log' katrin@192.168.110.76:/Volumes/DAQSTORAGE/data/ /mnt/katrin/data/inbox/FPDComm_530
+```
+
+If runs were not processed correctly, one can trigger manual reprocessing
+from an OpenShift terminal (with run numbers `START`, `END` as necessary):
+```
+./process-system.py -s fpd -r START END
+```
+
diff --git a/docs/troubleshooting.txt b/docs/troubleshooting.txt
index ae43c52..9fa6f91 100644
--- a/docs/troubleshooting.txt
+++ b/docs/troubleshooting.txt
@@ -134,6 +134,22 @@ etcd (and general operability)
pods (failed pods, rogue namespaces, etc...)
====
+ - The 'pods' scheduling may fail on one (or more) of the nodes after long waiting with 'oc logs' reporting
+ timeout. The 'oc describe' reports 'failed to create pod sandbox'. This can be caused by failure to clean-up
+ after terminated pod properly. It causes rogue network interfaces to remain in OpenVSwitch fabric.
+ * This can be determined by errors reported using 'ovs-vsctl show' or present in the log '/var/log/openvswitch/ovs-vswitchd.log'
+ which may quickly grow over 100MB.
+ could not open network device vethb9de241f (No such device)
+ * The work-around is to delete rogue interfaces with
+ ovs-vsctl del-port br0 <iface>
+ More info:
+ ovs-ofctl -O OpenFlow13 show br0
+ ovs-ofctl -O OpenFlow13 dump-flows br0
+ This does not solve the problem, however. The new interfaces will get abandoned by OpenShift.
+ * The issue is discussed here:
+ https://bugzilla.redhat.com/show_bug.cgi?id=1518684
+ https://bugzilla.redhat.com/show_bug.cgi?id=1518912
+
- After crashes / upgrades some pods may end up in 'Error' state. This is quite often happen to
* kube-service-catalog/controller-manager
* openshift-template-service-broker/api-server
@@ -185,6 +201,8 @@ pods (failed pods, rogue namespaces, etc...)
docker ps -aq --no-trunc | xargs docker rm
+
+
Builds
======
- After changing storage for integrated docker registry, it may refuse builds with HTTP error 500. It is necessary
diff --git a/kickstart/README b/kickstart/README
deleted file mode 100644
index b686abc..0000000
--- a/kickstart/README
+++ /dev/null
@@ -1,9 +0,0 @@
-Actions
-=======
-1) We need to build a CD which will request kickstart file from the web server
-2) This CD should be programmed in IPMI interface
- => Currently placed in /virtual/images/centos74-ands.iso on ipepdvsrv2
-3) The installation is triggered by ipmi commands
-4) The web server runs a php script which detects connecting server and templates appropriate kickstart file
- => It is installed on ufo.kit.edu in /srv/www/htdocs/ands/kickstart
- => Detction is based on MAC address headers which are sent by CentOS CD
diff --git a/kickstart/autocd/build.sh b/kickstart/autocd/build.sh
deleted file mode 100755
index 289b4e2..0000000
--- a/kickstart/autocd/build.sh
+++ /dev/null
@@ -1,17 +0,0 @@
-#!/bin/bash
-
-if [ -n "bootcd/isolinux" ]; then
- echo "1) Copy content of official CentOS boot CD to bootcd directory"
- echo "2) Replace files provided in 'isolinux' directory"
- exit
-fi
-
-(
- cd bootcd
- chmod 664 isolinux/isolinux.bin
-
- rm -f ../centos74-ands.iso
- mkisofs -o ../centos74-ands.iso -b isolinux/isolinux.bin -c boot.cat -no-emul-boot -V 'CentOS 7 x86_64' -boot-load-size 4 -boot-info-table -R -J -v -T .
-)
-
-scp centos74-ands.iso root@192.168.26.134:/virtual/images/
diff --git a/kickstart/autocd/isolinux/isolinux.cfg b/kickstart/autocd/isolinux/isolinux.cfg
deleted file mode 100644
index e537309..0000000
--- a/kickstart/autocd/isolinux/isolinux.cfg
+++ /dev/null
@@ -1,76 +0,0 @@
-#ui vesamenu.c32
-#prompt 0
-#display boot.msg
-
-default auto
-timeout 10
-
-# Second port, 115200 baud
-#serial 1 115200
-
-label auto
- menu label ^Kickstart CentOS 7 for Ands
- kernel vmlinuz
- append initrd=initrd.img inst.stage2=hd:LABEL=CentOS\x207\x20x86_64 console=tty1 console=ttyS1,115200 earlyprint=serial,ttyS1,115200 ip=dhcp inst.vnc inst.vncpassword=ipepdv inst.ks=http://ufo.kit.edu/ands/kickstart/ inst.ks.sendsn inst.ks.sendmac
-# append initrd=initrd.img inst.stage2=hd:LABEL=CentOS\x207\x20x86_64 nomodeset text console=tty1 console=ttyS1,115200 earlyprint=serial,ttyS1,115200 ip=dhcp inst.vnc inst.vncpassword=ipepdv inst.ks=http://ufo.kit.edu/ands/kickstart/ inst.ks.sendsn inst.ks.sendmac
-
-label linux
- menu label ^Install CentOS 7
- kernel vmlinuz
- append initrd=initrd.img inst.stage2=hd:LABEL=CentOS\x207\x20x86_64 quiet
-
-label check
- menu label Test this ^media & install CentOS 7
- menu default
- kernel vmlinuz
- append initrd=initrd.img inst.stage2=hd:LABEL=CentOS\x207\x20x86_64 rd.live.check quiet
-
-menu separator # insert an empty line
-
-# utilities submenu
-menu begin ^Troubleshooting
- menu title Troubleshooting
-
-label vesa
- menu indent count 5
- menu label Install CentOS 7 in ^basic graphics mode
- text help
- Try this option out if you're having trouble installing
- CentOS 7.
- endtext
- kernel vmlinuz
- append initrd=initrd.img inst.stage2=hd:LABEL=CentOS\x207\x20x86_64 xdriver=vesa nomodeset quiet
-
-label rescue
- menu indent count 5
- menu label ^Rescue a CentOS system
- text help
- If the system will not boot, this lets you access files
- and edit config files to try to get it booting again.
- endtext
- kernel vmlinuz
- append initrd=initrd.img inst.stage2=hd:LABEL=CentOS\x207\x20x86_64 rescue quiet
-
-label memtest
- menu label Run a ^memory test
- text help
- If your system is having issues, a problem with your
- system's memory may be the cause. Use this utility to
- see if the memory is working correctly.
- endtext
- kernel memtest
-
-menu separator # insert an empty line
-
-label local
- menu label Boot from ^local drive
- localboot 0xffff
-
-menu separator # insert an empty line
-menu separator # insert an empty line
-
-label returntomain
- menu label Return to ^main menu
- menu exit
-
-menu end
diff --git a/kickstart/ipmi.sh b/kickstart/ipmi.sh
deleted file mode 100755
index 4571fb0..0000000
--- a/kickstart/ipmi.sh
+++ /dev/null
@@ -1,157 +0,0 @@
-user="ADMIN"
-pass='$ipepdv$'
-sleep=0.5
-
-function smipmi_cmd {
- echo "- Running: SMCIPMITool "
- echo "$@"
- /opt/smcipmi/SMCIPMITool "$@"
-}
-
-function smipmi {
- host=$1
- shift
- smipmi_cmd $host ADMIN '$ipepdv$' "$@"
-}
-
-
-function ipmi_cmd {
- echo -n "- Running: ipmitool "
- echo "$@"
- /usr/sbin/ipmitool "$@"
-}
-
-function ipmi {
- host=$1
- shift
- ipmi_cmd -H $host -U ADMIN -P '$ipepdv$' "$@"
-
-}
-
-function configure {
- host=$1
-
- ipmi $host chassis bootdev disk persistent cons_redirect=enable verbose=default
- sleep 0.5
-}
-
-function install {
- host=$1
-
-# Requires license
-# smipmi $host wsiso mount 192.168.26.134 /images/centos74-ands.iso
-
- ipmi $host power off
- sleep 10
- ipmi $host chassis bootdev cdrom
- sleep $sleep
- ipmi $host power on
-}
-
-function boot {
- host=$1
-
- configure $host
- ipmi $host power on
- sleep $sleep
-}
-
-function reboot {
- host=$1
-
- ipmi $host power off
- sleep 10
- ipmi $host power on
- sleep $sleep
-}
-
-function bios {
- host=$1
-
- ipmi $host power off
- sleep 10
- ipmi $host chassis bootdev bios
- sleep $sleep
- ipmi $host power on
-}
-
-
-
-function status {
- host=$1
-
- ipmi $host power status | grep "off" &> /dev/null
- if [ $? -ne 0 ]; then echo 1; else echo 0; fi
-}
-
-function wait_off {
- host=$1
-
- on=1
- while [ 1 ]; do
- on=$(status $host)
- [ "$on" -eq 0 ] && break
- echo " - $host still running..."
- sleep 5
- done
-}
-
-function cmd {
- ipmi "$@"
-}
-
-
-if [[ "$1" =~ ^[0-9\-]+$ ]]; then
- IFS='-' read -ra range <<< "$1"
-
- if [ -n "${range[1]}" ]; then
- servers=$(seq ${range[0]} ${range[1]})
- else
- servers=$(seq ${range[0]} ${range[0]})
- fi
- shift
-else
- servers=$(seq 1 3)
-fi
-iip=$(for i in $servers ; do echo "192.168.26.4$i" ; done)
-
-shift=1
-if [ -z "$1" ]; then
- echo "$0 [#-#] <config|install|reboot|boot|wait>"
- echo "$0 [#] <cmd>"
- exit
-elif [[ "$1" =~ config ]]; then
- action="configure"
-elif [[ "$1" =~ install ]]; then
- action="install"
-elif [[ "$1" =~ reboot ]]; then
- action="reboot"
-elif [[ "$1" =~ boot ]]; then
- action="boot"
-elif [[ "$1" =~ bios ]]; then
- action="bios"
-elif [[ "$1" =~ status ]]; then
- action="status"
-elif [[ "$1" =~ wait ]]; then
- action="wait_off"
-else
- shift=0
- action="cmd"
-fi
-
-if [ $shift -eq 1 ]; then
- shift
-fi
-
-for ip in $iip; do
- eval "$action" "$ip" "$@"
-done
-
-if [ $action = "install" ]; then
- sleep 30
- for ip in $iip; do
- wait_off "$ip" "$@"
- configure "$ip" "$@"
-# boot "$iip" "$@"
- done
-fi
diff --git a/kickstart/kickstart/authorized_keys b/kickstart/kickstart/authorized_keys
deleted file mode 100644
index f7b2526..0000000
--- a/kickstart/kickstart/authorized_keys
+++ /dev/null
@@ -1,4 +0,0 @@
-ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGNetzktlBAcbsrdYEDim7x1JAbcN9n4W6BZKJsB3fim csa@styx
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQD0r0MkNhJ3ooBNlz2GNfwGCULhmwzVyfWXlogS0E96dUUKSsvVQGbHHq+Ig8fkFNSAXIuIouAVN/IzWZesAXDd6oIsW880cJkfoGhJfFHa9gmOT9hgzybByhSxk+ovPWv2kqpGaqF8WbXlWwrjeMqRN5gbKwS4BcH3nb4Xt6ghOXAaCm1LTMFmmq8xfW0RWFFKm8tmORcUtEwtj4X1lUtMJ64arGLkyhIPu2PTbgeS6yPqkdws4xTczw9ngOyGw6RXb2Pyrs2igUAfRlrYK6vZrmzFn6yjPWY7WMkgO7BHR009fFAKzF8d4kVmGNkiuQE36IajEg+Idpo5a5sJc+WSok7oaxg3/blCY6rPkx3YHAdi7eTbqLea9j27gxJUzaIqu/loGU3gn/ddm+4vwsEiE/ZM368WOyetvrOz7sskjLCI+BhiKXbyTd8dFPDQQMgIrOlipqC3ikIDHKzruYzNGpplI5Yua0y1SvM3dVLumR0hCjUiw9Ew4M2eFGR2HNH7f4rq00bSMPhZmoIzPYhaAqS1QKKNx+TlFqfu30jckfHiBNFZ0bBwG+dNg+dG2GpWaOWLFKeoyV1xNJ/m8pCY4eRSdMUFtRH0Dq9vP5NoFQHzxwAREly/OZbNSAFWGYKKDX+ZGQR33qGfkicl0g/8Ul6zP5mjsRaHiIWvL4W3wQ== csa@styx
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDIsJNm47UfzMUWMPAereBiqHQwvtKowirnp0pzCILSRztEWkKurnwyQmV9r4ZXibFS0dgN8aMgOoingLZOJnDUeelBMYZraEphAaQ3zkkYroZo7oJ9Ouuv5xYrhlKNKtrRDBpVuEcmmfNesRFB9Snb9QR3H90AlKh9KJmJcRPPGboCgyeGqTPVu6hJBwePqn4d8DPSGLCl8YP6XXhMMfVTxC4BBuKFvfIrD3yeyvnBVCIael6swytss6IK28BF3z/lahA2uNjhiy0rNKx2WHAAiUxaCRyLrmnJpCblvjlJ9OkwqDWrumVC8K0bVXeoyvZi2iICS2AZoKc0hREpJKo7 csa@gpg
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDXdXQHy2GMS467G1yorHyMnjj6Wo3jVYqVLlN3pYBRGkPZOhyaAFXsEOH0roeg4h+0cFgha+d9+naB6d0dKhfw131wb05yZih+H6+eWOTpl1u+gkkOdMmwyKd7ymSSGFueDGQyVAx3WDPgoUgz9Y2SJsKe0cJPOON9kdPFwGN7tfHt3GJ2VoqwUnuFvQ2deCTPzvir2kC4RyuhasbZtY69/rOpxo99eZFMgChG/ftqeLOl0xyJ2If5u9sZeyR5i9ytvHsq59FGF6BpKJtLmC0wkqBvImUYQWXYmClT5Lo7vZfkbJnWjeuQH5JJ75mcl4XTtdjx+QIK8aSeznclLDlN Android
diff --git a/kickstart/kickstart/authorized_keys.csa b/kickstart/kickstart/authorized_keys.csa
deleted file mode 100644
index f7b2526..0000000
--- a/kickstart/kickstart/authorized_keys.csa
+++ /dev/null
@@ -1,4 +0,0 @@
-ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIGNetzktlBAcbsrdYEDim7x1JAbcN9n4W6BZKJsB3fim csa@styx
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAACAQD0r0MkNhJ3ooBNlz2GNfwGCULhmwzVyfWXlogS0E96dUUKSsvVQGbHHq+Ig8fkFNSAXIuIouAVN/IzWZesAXDd6oIsW880cJkfoGhJfFHa9gmOT9hgzybByhSxk+ovPWv2kqpGaqF8WbXlWwrjeMqRN5gbKwS4BcH3nb4Xt6ghOXAaCm1LTMFmmq8xfW0RWFFKm8tmORcUtEwtj4X1lUtMJ64arGLkyhIPu2PTbgeS6yPqkdws4xTczw9ngOyGw6RXb2Pyrs2igUAfRlrYK6vZrmzFn6yjPWY7WMkgO7BHR009fFAKzF8d4kVmGNkiuQE36IajEg+Idpo5a5sJc+WSok7oaxg3/blCY6rPkx3YHAdi7eTbqLea9j27gxJUzaIqu/loGU3gn/ddm+4vwsEiE/ZM368WOyetvrOz7sskjLCI+BhiKXbyTd8dFPDQQMgIrOlipqC3ikIDHKzruYzNGpplI5Yua0y1SvM3dVLumR0hCjUiw9Ew4M2eFGR2HNH7f4rq00bSMPhZmoIzPYhaAqS1QKKNx+TlFqfu30jckfHiBNFZ0bBwG+dNg+dG2GpWaOWLFKeoyV1xNJ/m8pCY4eRSdMUFtRH0Dq9vP5NoFQHzxwAREly/OZbNSAFWGYKKDX+ZGQR33qGfkicl0g/8Ul6zP5mjsRaHiIWvL4W3wQ== csa@styx
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDIsJNm47UfzMUWMPAereBiqHQwvtKowirnp0pzCILSRztEWkKurnwyQmV9r4ZXibFS0dgN8aMgOoingLZOJnDUeelBMYZraEphAaQ3zkkYroZo7oJ9Ouuv5xYrhlKNKtrRDBpVuEcmmfNesRFB9Snb9QR3H90AlKh9KJmJcRPPGboCgyeGqTPVu6hJBwePqn4d8DPSGLCl8YP6XXhMMfVTxC4BBuKFvfIrD3yeyvnBVCIael6swytss6IK28BF3z/lahA2uNjhiy0rNKx2WHAAiUxaCRyLrmnJpCblvjlJ9OkwqDWrumVC8K0bVXeoyvZi2iICS2AZoKc0hREpJKo7 csa@gpg
-ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQDXdXQHy2GMS467G1yorHyMnjj6Wo3jVYqVLlN3pYBRGkPZOhyaAFXsEOH0roeg4h+0cFgha+d9+naB6d0dKhfw131wb05yZih+H6+eWOTpl1u+gkkOdMmwyKd7ymSSGFueDGQyVAx3WDPgoUgz9Y2SJsKe0cJPOON9kdPFwGN7tfHt3GJ2VoqwUnuFvQ2deCTPzvir2kC4RyuhasbZtY69/rOpxo99eZFMgChG/ftqeLOl0xyJ2If5u9sZeyR5i9ytvHsq59FGF6BpKJtLmC0wkqBvImUYQWXYmClT5Lo7vZfkbJnWjeuQH5JJ75mcl4XTtdjx+QIK8aSeznclLDlN Android
diff --git a/kickstart/kickstart/index.php b/kickstart/kickstart/index.php
deleted file mode 100644
index 9336fb8..0000000
--- a/kickstart/kickstart/index.php
+++ /dev/null
@@ -1,182 +0,0 @@
-<?php
-
-#$VERBOSE = 1;
-
-# disks will be synchronized with k3
-$KATRIN_SERVERS = array(
- "ks" => "ipekatrin-v4.ks",
- "domain" => "ipe.kit.edu",
- "netmask" => "255.255.254.0",
- "gw" => "141.52.64.207",
- "ns" => "141.52.3.3,141.52.8.18",
- "time" => "141.52.8.18",
- "sysdisks" => "sdb,sdc",
- "append_sol" => "console=tty1 console=ttyS1,115200 earlyprint=serial,ttyS1,115200",
- "raid" => "RAID1",
- "size" => "80000",
- "bootsize" => "2048",
- "ethdev" => "enp3s0f1",
- "ethdev2" => "enp3s0f0",
-);
-
-$SERVER_LIST = array(
- "ipecsavm" => array_merge($KATRIN_SERVERS, array(
- "ks" => "ipekatrin-v4.ks",
- "macs" => array("66:66:66:13:13:00"),
- "ip" => "192.168.26.254",
- "netmask" => "255.255.255.0",
- "gw" => "192.168.26.117",
- "sysdisks" => "sda,sdb",
- "raid" => "RAID0",
- "size" => "60000",
- "ethdev" => "link",
- )),
- "ipechilinga2" => array_merge($KATRIN_SERVERS, array(
- "domain" => "ka.fzk.de",
- "macs" => array("48:5b:39:75:fe:ec"),
- "headers" => array(
- "REMOTE_ADDR" => array("141.52.64.104")
- )
- )),
- "ipekatrin1" => array_merge($KATRIN_SERVERS, array(
- "macs" => array("0c:c4:7a:de:f1:08", "0c:c4:7a:de:f1:09")
- )),
- "ipekatrin2" => array_merge($KATRIN_SERVERS, array(
- "macs" => array("0c:c4:7a:de:f0:e6", "0c:c4:7a:de:f0:e7")
- )),
- "ipekatrin3" => array_merge($KATRIN_SERVERS, array(
- "macs" => array("0c:c4:7a:a8:81:3e", "0c:c4:7a:a8:81:3f"),
- "ethdev" => "eno2",
- "ethdev2" => "eno1",
- ))
-);
-
-
-function get_server($srvid) {
- global $SERVER_LIST;
-
- $server = $SERVER_LIST[$srvid];
-
- if (!isset($server["fqdn"]))
- $server["fqdn"] = "{$srvid}.{$server['domain']}";
-
- if (!isset($server["ip"]))
- $server["ip"] = gethostbyname($server["fqdn"]);
-
- $disks = explode(",", $server["sysdisks"]);
- if (!isset($server["bootdisk"]))
- $server["bootdisk"] = $disks[0];
-
- if (!isset($server["disk1"]))
- $server["disk1"] = $disks[0];
-
- if ((isset($disks[1]))&&(!isset($server["disk2"])))
- $server["disk2"] = $disks[1];
-
-
- unset($server["macs"]);
- unset($server["headers"]);
-
- return $server;
-}
-
-function find_mac($macs, $mac_header) {
- if (!is_array($macs)) $macs = array($macs);
-
- foreach ($macs as $mac) {
- if (preg_match("/$mac/", $mac_header))
- return true;
- }
- return false;
-}
-
-function find_server_by_mac($mac_header) {
- global $SERVER_LIST;
-
- foreach ($SERVER_LIST as $srvid => $server) {
- if (find_mac($server['macs'], $mac_header))
- return get_server($srvid);
- }
- return false;
-}
-
-function find_server_by_header($http_header, $value) {
- global $SERVER_LIST;
-
- foreach ($SERVER_LIST as $srvid => $server) {
- if ((is_array($server["headers"]))&&(isset($server["headers"][$http_header]))) {
- $expected = $server["headers"][$http_header];
- if (!is_array($expected)) $expected = array($expected);
- foreach ($expected as $re) {
- if (preg_match("/$re/", $value))
- return get_server($srvid);
- }
- }
- }
- return false;
-}
-
-function find_server() {
- global $_SERVER;
-
- $headers = getallheaders();
- for ($i = 0; $i < 10; $i++) {
- $if = "X-RHN-Provisioning-MAC-$i";
- if (!isset($headers[$if])) break;
-
- $server = find_server_by_mac($headers[$if]);
- if ($server) return $server;
- }
-
- foreach ($_SERVER as $header => $value) {
- $server = find_server_by_header($header, $value);
- if ($server) return $server;
- }
-
- return false;
-}
-
-
-
-
-#echo "Request from: " . $_SERVER["REMOTE_ADDR"];
-
-$server = find_server();
-if (!$server) {
- $f = fopen("/srv/www/htdocs/ands/logs/kickstart-new.log", "a+");
- if ($f) {
- fwrite($f, print_r($_SERVER, true));
- fwrite($f, print_r(getallheaders(), true));
- fclose($f);
- }
- return;
-}
-
-$ks = file_get_contents($server["ks"]);
-
-$patterns=array(); $values=array();
-foreach ($server as $key => $val) {
- array_push($patterns, "/@" . strtoupper($key) . "@/");
- array_push($values, $val);
-}
-
-$ks = preg_replace($patterns, $values, $ks);
-
-if ($VERBOSE) {
- $f = fopen("/srv/www/htdocs/ands/logs/kickstart.log", "a+");
- if ($f) {
- fwrite($f, "-----------------------------------------------------\n");
- fwrite($f, print_r($server, true));
- fwrite($f, print_r($_SERVER, true));
- fwrite($f, print_r(getallheaders(), true));
- fwrite($f, "-----------------------------------------------------\n");
- fwrite($f, $ks);
- fwrite($f, "=====================================================\n");
- fclose($f);
- }
-}
-
-header("Content-type: text/plain");
-echo $ks;
-
-?>
diff --git a/kickstart/kickstart/ipekatrin-v4.ks b/kickstart/kickstart/ipekatrin-v4.ks
deleted file mode 100644
index ba1a30a..0000000
--- a/kickstart/kickstart/ipekatrin-v4.ks
+++ /dev/null
@@ -1,100 +0,0 @@
-#version=DEVEL
-
-# System authorization information
-auth --enableshadow --passalgo=sha512
-
-# Use CDROM installation media
-cdrom
-
-# Use graphical install (graphical is enforce by vnc requested at kernel args)
-#text
-graphical
-
-# Run the Setup Agent on first boot
-firstboot --enable
-ignoredisk --only-use=@SYSDISKS@
-# Keyboard layouts
-keyboard --vckeymap=us --xlayouts='us'
-# System language
-lang en_US.UTF-8
-
-# Network information (device=link signifies first device link active)
-network --device=@ETHDEV@ --bootproto=static --ip=@IP@ --netmask=@NETMASK@ --gateway=@GW@ --nameserver=@NS@ --noipv6 --onboot=on --activate
-#network --device=@ETHDEV2@ --bootproto=static --ip=@IP@ --netmask=@NETMASK@ --gateway=@GW@ --nameserver=@NS@ --noipv6 --onboot=off --activate
-#network --bootproto=dhcp --device=eth0 --ipv6=auto --activate
-network --hostname=@FQDN@
-
-
-# Partition clearing information
-clearpart --all --drives=@SYSDISKS@
-zerombr
-
-# System bootloader configuration
-bootloader --location=mbr --driveorder=@SYSDISKS@ --boot-drive=@BOOTDISK@ --append=" crashkernel=auto @APPEND_SOL@"
-
-#autopart --type=lvm
-#reqpart --add-boot
-part raid.01 --ondisk=@DISK1@ --asprimary --size @BOOTSIZE@
-part raid.02 --ondisk=@DISK2@ --asprimary --size @BOOTSIZE@
-part swap --ondisk=@DISK1@ --asprimary --fstype=swap --recommended
-part swap --ondisk=@DISK2@ --asprimary --fstype=swap --recommended
-part raid.03 --ondisk=@DISK1@ --asprimary --size @SIZE@ --grow
-part raid.04 --ondisk=@DISK2@ --asprimary --size @SIZE@ --grow
-raid /boot --level=@RAID@ --device md0 raid.01 raid.02 --fstype=ext4
-raid pv.01 --level=@RAID@ --device=md1 raid.03 raid.04
-volgroup sysvg pv.01
-logvol / --vgname=sysvg --size=@SIZE@ --name=lv_root --fstype=ext4
-
-# Root password (new)
-rootpw --iscrypted $6$ihAbktYN$T36KRAmi8ccjNrE5Y0gEl11Rb/dl3GjemejAJyHVzrAL51/st7aMZ0dqnMIkhubX/gUcPe5LdTlJODC9D/60h0
-# Root passowrd (old)
-#rootpw --iscrypted $6$ioKrEQSxzYypx2HZ$jiynrl6knbmhbL066k.HjmxcwvQwBsT53LPlp2fRdkg2E1E7Gy4gwxaZ0m86rbD6q4dTaWdYfKhDVSij6N1Y7.
-
-# System services
-services --enabled="chronyd"
-# System timezone
-timezone Europe/Berlin --isUtc --ntpservers=@TIME@
-user --groups=wheel --name=csa --gecos="Suren A. Chilingaryan"
-
-# SELinux configuration
-#selinux --disabled
-
-# Do not configure the X Window System
-skipx
-
-install
-poweroff
-
-
-%packages
-@^minimal
-@core
-chrony
-kexec-tools
-curl
-%end
-
-%addon com_redhat_kdump --enable --reserve-mb='auto'
-%end
-
-%anaconda
-pwpolicy root --minlen=6 --minquality=1 --notstrict --nochanges --notempty
-pwpolicy user --minlen=6 --minquality=1 --notstrict --nochanges --emptyok
-pwpolicy luks --minlen=6 --minquality=1 --notstrict --nochanges --notempty
-%end
-
-
-%post --log=/var/log/ks01.log
-yum install -y unzip
-
-mkdir /root/.ssh
-chmod 0700 /root/.ssh
-curl http://ufo.kit.edu/ands/kickstart/authorized_keys -o /root/.ssh/authorized_keys
-chmod 0600 /root/.ssh/authorized_keys
-
-mkdir /home/csa/.ssh
-chmod 0700 /home/csa/.ssh
-curl http://ufo.kit.edu/ands/kickstart/authorized_keys.csa -o /home/csa/.ssh/authorized_keys
-chown -R csa:user /home/csa/.ssh
-chmod 0600 /home/csa/.ssh/authorized_keys
-%end
diff --git a/kickstart/testvm/centos.sh b/kickstart/testvm/centos.sh
deleted file mode 100755
index a437ed8..0000000
--- a/kickstart/testvm/centos.sh
+++ /dev/null
@@ -1,9 +0,0 @@
-#! /bin/bash
-
-sudo modprobe kvm-intel
-sudo /etc/init.d/vde start
-
-[ ! -f 1.qcow2 ] && qemu-img create -f qcow2 1.qcow2 "80G"
-[ ! -f 2.qcow2 ] && qemu-img create -f qcow2 2.qcow2 "80G"
-
-qemu-system-x86_64 -enable-kvm -display sdl -hda 1.qcow2 -hdb 2.qcow2 -m 2048 -net nic,macaddr=66:66:66:13:13:00 -net vde,sock=/var/run/vde.ctl -cdrom centos74-ands.iso -boot order=d
diff --git a/opts.sh b/opts.sh
index d44398f..d87b89d 100644
--- a/opts.sh
+++ b/opts.sh
@@ -47,6 +47,7 @@ Actions:
users - configure user roles & passwords
storage - reconfigures Gluster and OpenShift volumes
projects - reconfigures OpenShift resources if necessary
+ add_project <name> - add a new OpenShift namespace
project <name> - reconfigures a single OpenShift namespace
project_groups <n> - reconfigures fs groups for a single OpenShift namespace (required for Ganesha)
apps <prj> [app] - only re-generates templates for the specific namespaces (or even only specific application)
diff --git a/playbooks/openshift-add-project.yml b/playbooks/openshift-add-project.yml
new file mode 100644
index 0000000..5a027f1
--- /dev/null
+++ b/playbooks/openshift-add-project.yml
@@ -0,0 +1,31 @@
+- import_playbook: maintain.yml
+# Despite its name, the next play applies the projects, users, security and storage subroles of ands_openshift.
+- name: Configure users
+ hosts: masters
+ roles:
+ - { role: ands_openshift, subrole: projects }
+ - { role: ands_openshift, subrole: users }
+ - { role: ands_openshift, subrole: security }
+ - { role: ands_openshift, subrole: storage }
+
+# Storage subrole of ands_kaas on the storage servers, limited to 'host' storage; 'ands_configure_project' is not set in this file and has to be supplied by the caller.
+- name: Configure per-node {{ kaas_single_project }} project storage
+ hosts: ands_storage_servers, ands_new_storage_servers
+ roles:
+ - { role: ands_kaas, subrole: storage }
+ vars:
+ kaas_projects: "{{ ands_openshift_projects.keys() }}"
+ kaas_openshift_volumes: "{{ ands_openshift_volumes }}"
+ kaas_storage_types: [ 'host' ]
+ kaas_single_project: "{{ ands_configure_project }}"
+
+# Complete ands_kaas role on the masters, restricted to the same single project.
+- name: "Configure project {{ kaas_single_project }}"
+ hosts: masters
+ roles:
+ - { role: ands_kaas }
+ vars:
+ kaas_openshift_volumes: "{{ ands_openshift_volumes }}"
+ kaas_projects: "{{ ands_openshift_projects.keys() }}"
+ kaas_single_project: "{{ ands_configure_project }}"
+
diff --git a/roles/ands_kaas/tasks/do_project.yml b/roles/ands_kaas/tasks/do_project.yml
index e64e9e3..af078cd 100644
--- a/roles/ands_kaas/tasks/do_project.yml
+++ b/roles/ands_kaas/tasks/do_project.yml
@@ -7,7 +7,8 @@
run_once: true
vars:
script: "{{ kaas_project_script }}"
- when: kaas_project_script is defined
+ when:
+ - kaas_project_script != ands_none
- block:
- name: Configure storage
@@ -23,5 +24,5 @@
include_tasks: do_apps.yml
run_once: true
when:
- - kaas_project_script is undefined
+ - kaas_project_script == ands_none
diff --git a/roles/ands_kaas/tasks/project.yml b/roles/ands_kaas/tasks/project.yml
index 2c05afd..0376477 100644
--- a/roles/ands_kaas/tasks/project.yml
+++ b/roles/ands_kaas/tasks/project.yml
@@ -63,4 +63,4 @@
kaas_project_gids: "{{ kaas_project_config.gids | default(kaas_openshift_gids) }}"
kaas_project_uids: "{{ kaas_project_config.uids | default(kaas_openshift_uids) }}"
kaas_blockvol_info: "{{ block_info }}"
- kaas_project_script: "{{ kaas_project_config.oc | default(omit) }}"
+ kaas_project_script: "{{ kaas_project_config.oc | default(ands_none) }}"
diff --git a/roles/ands_monitor/tasks/main.yml b/roles/ands_monitor/tasks/main.yml
index ac70d28..8cac4ea 100644
--- a/roles/ands_monitor/tasks/main.yml
+++ b/roles/ands_monitor/tasks/main.yml
@@ -1,3 +1,8 @@
+- name: Install monitoring applications
+ package: name={{item}} state=present
+ with_items:
+ - sysstat
+
- name: Create scripts directory
file: path="{{ ands_script_path }}" state=directory
@@ -7,3 +12,11 @@
script_name: "{{ item | basename | regex_replace('\\.j2','') }}"
with_fileglob:
- "{{ role_path }}/templates/scripts/*.j2"
+
+
+- name: "Deploy cron jobs"
+ template: src="{{ item | quote }}" dest="/etc/cron.d/{{ cron_name }}" owner=root group=root mode=0644
+ vars:
+ cron_name: "{{ item | basename | regex_replace('\\.j2','') }}"
+ with_fileglob:
+ - "{{ role_path }}/templates/cron/*.j2"
diff --git a/roles/ands_monitor/templates/cron/maintain.j2 b/roles/ands_monitor/templates/cron/maintain.j2
new file mode 100644
index 0000000..2c3ce9c
--- /dev/null
+++ b/roles/ands_monitor/templates/cron/maintain.j2
@@ -0,0 +1,4 @@
+SHELL=/bin/bash
+PATH=/sbin:/bin:/usr/sbin:/usr/bin
+MAILTO=csa-darkserv@suren.me
+33 */4 * * * root {{ ands_script_path }}/maintain.sh
diff --git a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
index caa63ce..b02f031 100755
--- a/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
+++ b/roles/ands_monitor/templates/scripts/check_server_status.sh.j2
@@ -32,3 +32,14 @@ if [ -z "$disks" -o "$disks" -ne 0 ]; then
echo "Not all disks are online:"
/opt/MegaRAID/storcli/storcli64 /c0 show | grep -P "(HDD|SSD)" | grep -v "Onln"
fi
+
+ifaces=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+" | wc -l)
+if [ "$ifaces" -gt 50 ]; then
+ echo "Too many rogue interfaces ($ifaces) is registered on OpenVSwitch bridge. It could introduce large delays in pod scheduling..."
+fi
+
+# Known problem: oversized ovs-vswitchd log. A missing log yields an empty size, which is treated as 0 MB instead of breaking the numeric test.
+vssize=$(du -sm /var/log/openvswitch/ovs-vswitchd.log 2> /dev/null | cut -f 1)
+if [ "${vssize:-0}" -gt 128 ]; then
+ echo "Current OpenVSwitch log is over $vssize MB. It could indicate some severe problems in pod networking..."
+fi
diff --git a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2 b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2
index 0602fcb..7acac5f 100755
--- a/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2
+++ b/roles/ands_monitor/templates/scripts/check_uptime_status.sh.j2
@@ -2,7 +2,14 @@
up=$(uptime | cut -d ' ' -f 4- | cut -d ',' -f 1 | sed -re 's/^\s*//')
load=$(uptime | cut -d ' ' -f 4- | cut -d ',' -f 4- | cut -d ':' -f 2 | cut -d ',' -f 3 | sed -re 's/^\s*//')
+#pods=$(oc get pods --all-namespaces -o wide | grep `hostname` | wc -l)
+containers=$(docker ps -q | wc -l)
+#processes=$(ps xa --no-headers | wc -l)
+mem=$(free -t -g | grep "Mem:" | sed -re 's/\s+/ /g' | cut -d ' ' -f 3)
+iops=$(iostat -d | grep -E "^sd" | awk '{s+=$2} END {print s}' | cut -d '.' -f 1)
+net=$(ifstat -n; sleep 0.1; ifstat | grep -E "^(enp|ib)" | awk '{s+=$4+$5} END {print s}'); net=$((net / 100))
disks=$(/opt/MegaRAID/storcli/storcli64 /c0 show | grep -P "(HDD|SSD)" | grep "Onln" | wc -l)
data=`df -lh /mnt/ands | grep -vi Filesystem | sed -e 's/[[:space:]]\+/ /g' | cut -d ' ' -f 4`
-echo -n "1 Up $up \${color gray}/ $disks disks, $data free, load: $load"
+#echo -n "1 Up $up \${color gray}/ $disks disks, $data free, load: $load, pods: $pods"
+echo -en "1 $up\${color gray}, ${disks}/${data}, $(printf %3u ${containers}) c - $(printf %4.1f ${load}), $(printf %3u ${mem}) GB, $(printf %4u ${iops}) IOPS, $(printf %3u ${net}) MB/s"
diff --git a/roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2 b/roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2
new file mode 100755
index 0000000..c04ce60
--- /dev/null
+++ b/roles/ands_monitor/templates/scripts/clean_rogue_interfaces.sh.j2
@@ -0,0 +1,18 @@
+#!/bin/bash
+# Removes from bridge br0 the 'veth' ports which 'ovs-vsctl show' reports as "could not open network device".
+ifaces=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+")
+[ $? -eq 0 ] || exit # grep matched nothing: no rogue ports are reported
+
+#Find bridge
+#ovs-vsctl list-br
+
+for iface in $ifaces; do
+# echo "$iface"
+
+# Skip interfaces which still exist. Query the exact device name: grepping the full listing would also match names that merely contain $iface.
+ ip link show dev "$iface" &> /dev/null
+ [ $? -eq 0 ] && continue
+
+ echo "Removing: $iface"
+ ovs-vsctl del-port br0 "$iface"
+done
diff --git a/roles/ands_monitor/templates/scripts/list_containers.sh.j2 b/roles/ands_monitor/templates/scripts/list_containers.sh.j2
new file mode 100755
index 0000000..bac2884
--- /dev/null
+++ b/roles/ands_monitor/templates/scripts/list_containers.sh.j2
@@ -0,0 +1,3 @@
+#! /bin/bash
+# Prints one line per pod of all namespaces: pod name, container image(s), " - ", container ID(s).
+kubectl get pods --all-namespaces -o jsonpath='{range .items[*]}{@.metadata.name}{" "}{@.spec.containers[*].image}{" - "}{@.status.containerStatuses[*].containerID}{"\n"}{end}'
diff --git a/roles/ands_monitor/templates/scripts/maintain.sh.j2 b/roles/ands_monitor/templates/scripts/maintain.sh.j2
new file mode 100755
index 0000000..45c9513
--- /dev/null
+++ b/roles/ands_monitor/templates/scripts/maintain.sh.j2
@@ -0,0 +1,8 @@
+#! /bin/bash
+
+# Count the 'veth' ports which OpenVSwitch can not open any more; the clean-up is only started once more than 25 have piled up
+rogue=$(ovs-vsctl show | grep -oP "could not open network device\s*\Kveth[a-f0-9]+" | wc -l)
+[ $rogue -gt 25 ] || exit 0
+
+echo "Cleaning rogue interfaces ($rogue) on $(hostname)"
+{{ ands_script_path }}/clean_rogue_interfaces.sh > /dev/null
diff --git a/setup.sh b/setup.sh
index 090bd7d..f4271f8 100755
--- a/setup.sh
+++ b/setup.sh
@@ -49,6 +49,12 @@ case "$action" in
projects)
apply playbooks/openshift-setup-projects.yml "$@" || exit 1
;;
+ add_project)
+ project=$1
+ shift
+ [ -n "$project" ] || { usage 'project name should be specified...' ; exit 1; }
+ apply playbooks/openshift-add-project.yml --extra-vars "ands_configure_project=$project" "$@" || exit 1
+ ;;
project)
project=$1
shift
diff --git a/setup/configs/openshift.yml b/setup/configs/openshift.yml
index a4024ae..5637269 100644
--- a/setup/configs/openshift.yml
+++ b/setup/configs/openshift.yml
@@ -3,6 +3,7 @@ ands_openshift_projects:
kaas: KaaS router and common resources
katrin: KArlsruhe TRItium Neutrino
adei: ADEI
+ adai: ADAI
bora: Build Once Run Always
web: Web Sites
mon: OpenShift monitoring
@@ -14,6 +15,7 @@ ands_openshift_users:
csa: { name: "Suren A. Chilingaryan", email: "csa@suren.me", uid: "1001", shell: "/bin/bash" }
kopmann: { name: "Andreas Kopmann", email: "kopmann@kit.edu" }
ntj: { name: "Nicholas Tan Jerome", email: "nicholas.jerome@kit.edu" }
+ jonasteufel: { name: "Jonas Teufel", email: "jonseb1998@gmail.com" }
ands_openshift_roles:
cluster-admin: csa
@@ -22,7 +24,8 @@ ands_openshift_roles:
adei/admin: csa
adei/view: pdv, kopmann
adei/kaas-maintain: pdv, kopmann
+ adai/admin: csa
bora/admin: ntj
- web/admin: kopmann
+ web/admin: kopmann, jonasteufel
mon/admin: csa
test/admin: csa, ntj, kopmann, katrin
diff --git a/setup/configs/security.yml b/setup/configs/security.yml
index 22784b3..8e418f9 100644
--- a/setup/configs/security.yml
+++ b/setup/configs/security.yml
@@ -20,6 +20,7 @@ ands_openshift_gid_ranges:
kaas: "4000/10"
katrin: "5000/10"
adei: "6000/10"
+ adai: "6050/10"
bora: "6100/10"
web: "6200/10"
mon: "7000/10"
diff --git a/setup/users/htpasswd b/setup/users/htpasswd
index cf0f1e6..b7165a4 100644
--- a/setup/users/htpasswd
+++ b/setup/users/htpasswd
@@ -1,5 +1,6 @@
pdv:$apr1$ACvj6uUa$Nm1Vq8hZq3RzTtaYpAHv01
csa:$apr1$IqEwdnzy$UAdd8ZSFnXommBbj29w3c0
-katrin:$apr1$/hxgbxC4$/MxeHtIYvAJcIQFR5Jz0E0
-ntj:$apr1$un8GkxMv$VZ36KeB90qwMMVpvHxOj8.
+katrin:$apr1$AQIm74Ae$CJWEzUK6jEYSsk28DQ9du0
+ntj:$apr1$G5/ThWdp$kFLsj/hO9jIYYP.Zab9kC/
kopmann:CZFk3ASLX0Vq6
+jonasteufel:$apr1$2dsiiZ1p$Us/5i8DEt9fxeliGy7L6h/