summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorScott Dodson <sdodson@redhat.com>2017-07-18 15:10:37 -0400
committerGitHub <noreply@github.com>2017-07-18 15:10:37 -0400
commit8e2be54c6f2f55368f33667a68a5693dc75b1027 (patch)
tree0f63035787dce728f9fa994a2dad78cff5bb9f0c
parent742203529902ba278c213e326f81f667304b9625 (diff)
parent5a94fe5b074d01a3b16db8a05c47c31e484e5ebe (diff)
downloadopenshift-8e2be54c6f2f55368f33667a68a5693dc75b1027.tar.gz
openshift-8e2be54c6f2f55368f33667a68a5693dc75b1027.tar.bz2
openshift-8e2be54c6f2f55368f33667a68a5693dc75b1027.tar.xz
openshift-8e2be54c6f2f55368f33667a68a5693dc75b1027.zip
Merge pull request #4698 from sdodson/service-retries
Service retries
-rw-r--r--playbooks/adhoc/uninstall.yml4
-rw-r--r--playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml4
-rw-r--r--playbooks/common/openshift-cluster/upgrades/docker/restart.yml4
-rw-r--r--playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml8
-rw-r--r--playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml4
-rw-r--r--playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml4
-rw-r--r--playbooks/common/openshift-node/restart.yml4
-rw-r--r--roles/calico/handlers/main.yml4
-rw-r--r--roles/contiv/tasks/netplugin.yml4
-rw-r--r--roles/docker/handlers/main.yml3
-rw-r--r--roles/docker/tasks/package_docker.yml7
-rw-r--r--roles/docker/tasks/systemcontainer_docker.yml12
-rw-r--r--roles/flannel/handlers/main.yml4
-rw-r--r--roles/openshift_node/handlers/main.yml13
-rw-r--r--roles/openshift_node/tasks/main.yml16
-rw-r--r--roles/openshift_node_certificates/handlers/main.yml4
-rw-r--r--roles/openshift_node_upgrade/README.md5
-rw-r--r--roles/openshift_node_upgrade/handlers/main.yml20
-rw-r--r--roles/openshift_node_upgrade/tasks/docker/upgrade.yml8
-rw-r--r--roles/openshift_node_upgrade/tasks/restart.yml2
20 files changed, 121 insertions, 13 deletions
diff --git a/playbooks/adhoc/uninstall.yml b/playbooks/adhoc/uninstall.yml
index a1f541712..58b3a7835 100644
--- a/playbooks/adhoc/uninstall.yml
+++ b/playbooks/adhoc/uninstall.yml
@@ -325,6 +325,10 @@
service: name=docker state=restarted
failed_when: false
when: not (container_engine | changed)
+ register: l_docker_restart_docker_in_pb_result
+ until: not l_docker_restart_docker_in_pb_result | failed
+ retries: 3
+ delay: 30
- hosts: masters
become: yes
diff --git a/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml b/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml
index 07db071ce..1ed9041d4 100644
--- a/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml
+++ b/playbooks/common/openshift-cluster/upgrades/docker/docker_upgrade.yml
@@ -55,6 +55,10 @@
{{ openshift.common.admin_binary }} drain {{ openshift.node.nodename }} --force --delete-local-data --ignore-daemonsets
delegate_to: "{{ groups.oo_first_master.0 }}"
when: l_docker_upgrade is defined and l_docker_upgrade | bool and inventory_hostname in groups.oo_nodes_to_upgrade
+ register: l_docker_upgrade_drain_result
+ until: not l_docker_upgrade_drain_result | failed
+ retries: 60
+ delay: 60
- include: upgrade.yml
when: l_docker_upgrade is defined and l_docker_upgrade | bool
diff --git a/playbooks/common/openshift-cluster/upgrades/docker/restart.yml b/playbooks/common/openshift-cluster/upgrades/docker/restart.yml
index 96c729d79..13313377e 100644
--- a/playbooks/common/openshift-cluster/upgrades/docker/restart.yml
+++ b/playbooks/common/openshift-cluster/upgrades/docker/restart.yml
@@ -1,6 +1,10 @@
---
- name: Restart docker
service: name=docker state=restarted
+ register: l_docker_restart_docker_in_upgrade_result
+ until: not l_docker_restart_docker_in_upgrade_result | failed
+ retries: 3
+ delay: 30
- name: Update docker facts
openshift_facts:
diff --git a/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml b/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml
index 17f8fc6e9..35d000e49 100644
--- a/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml
+++ b/playbooks/common/openshift-cluster/upgrades/docker/upgrade.yml
@@ -32,7 +32,13 @@
- debug: var=docker_image_count.stdout
when: docker_upgrade_nuke_images is defined and docker_upgrade_nuke_images | bool
-- service: name=docker state=stopped
+- service:
+ name: docker
+ state: stopped
+ register: l_pb_docker_upgrade_stop_result
+ until: not l_pb_docker_upgrade_stop_result | failed
+ retries: 3
+ delay: 30
- name: Upgrade Docker
package: name=docker{{ '-' + docker_version }} state=present
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
index b7f089d99..2b2f10aee 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_control_plane.yml
@@ -296,6 +296,10 @@
command: >
{{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --config={{ openshift.common.config_base }}/master/admin.kubeconfig --force --delete-local-data --ignore-daemonsets
delegate_to: "{{ groups.oo_first_master.0 }}"
+ register: l_upgrade_control_plane_drain_result
+ until: not l_upgrade_control_plane_drain_result | failed
+ retries: 60
+ delay: 60
roles:
- lib_openshift
diff --git a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
index 1d1e440d4..af15ec5b2 100644
--- a/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
+++ b/playbooks/common/openshift-cluster/upgrades/upgrade_nodes.yml
@@ -28,6 +28,10 @@
command: >
{{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --force --delete-local-data --ignore-daemonsets
delegate_to: "{{ groups.oo_first_master.0 }}"
+ register: l_upgrade_nodes_drain_result
+ until: not l_upgrade_nodes_drain_result | failed
+ retries: 60
+ delay: 60
roles:
- lib_openshift
diff --git a/playbooks/common/openshift-node/restart.yml b/playbooks/common/openshift-node/restart.yml
index 63273cb78..ed2473a43 100644
--- a/playbooks/common/openshift-node/restart.yml
+++ b/playbooks/common/openshift-node/restart.yml
@@ -11,6 +11,10 @@
service:
name: docker
state: restarted
+ register: l_docker_restart_docker_in_node_result
+ until: not l_docker_restart_docker_in_node_result | failed
+ retries: 3
+ delay: 30
- name: Update docker facts
openshift_facts:
diff --git a/roles/calico/handlers/main.yml b/roles/calico/handlers/main.yml
index 53cecfcc3..67fc0065f 100644
--- a/roles/calico/handlers/main.yml
+++ b/roles/calico/handlers/main.yml
@@ -8,3 +8,7 @@
systemd:
name: "{{ openshift.docker.service_name }}"
state: restarted
+ register: l_docker_restart_docker_in_calico_result
+ until: not l_docker_restart_docker_in_calico_result | failed
+ retries: 3
+ delay: 30
diff --git a/roles/contiv/tasks/netplugin.yml b/roles/contiv/tasks/netplugin.yml
index 0847c92bc..e861a2591 100644
--- a/roles/contiv/tasks/netplugin.yml
+++ b/roles/contiv/tasks/netplugin.yml
@@ -108,6 +108,10 @@
name: "{{ openshift.docker.service_name }}"
state: restarted
when: docker_updated|changed
+ register: l_docker_restart_docker_in_contiv_result
+ until: not l_docker_restart_docker_in_contiv_result | failed
+ retries: 3
+ delay: 30
- name: Netplugin | Enable Netplugin
service:
diff --git a/roles/docker/handlers/main.yml b/roles/docker/handlers/main.yml
index 3a4f4ba92..591367467 100644
--- a/roles/docker/handlers/main.yml
+++ b/roles/docker/handlers/main.yml
@@ -6,9 +6,8 @@
state: restarted
register: r_docker_restart_docker_result
until: not r_docker_restart_docker_result | failed
- retries: 1
+ retries: 3
delay: 30
-
when: not docker_service_status_changed | default(false) | bool
- name: restart udev
diff --git a/roles/docker/tasks/package_docker.yml b/roles/docker/tasks/package_docker.yml
index c82d8659a..5f536005d 100644
--- a/roles/docker/tasks/package_docker.yml
+++ b/roles/docker/tasks/package_docker.yml
@@ -123,9 +123,12 @@
enabled: yes
state: started
daemon_reload: yes
- register: start_result
+ register: r_docker_package_docker_start_result
+ until: not r_docker_package_docker_start_result | failed
+ retries: 3
+ delay: 30
- set_fact:
- docker_service_status_changed: start_result | changed
+ docker_service_status_changed: "{{ r_docker_package_docker_start_result | changed }}"
- meta: flush_handlers
diff --git a/roles/docker/tasks/systemcontainer_docker.yml b/roles/docker/tasks/systemcontainer_docker.yml
index d8c5ccfd3..57a84bc2c 100644
--- a/roles/docker/tasks/systemcontainer_docker.yml
+++ b/roles/docker/tasks/systemcontainer_docker.yml
@@ -46,6 +46,11 @@
state: stopped
daemon_reload: yes
ignore_errors: True
+ register: r_docker_systemcontainer_docker_stop_result
+ until: not r_docker_systemcontainer_docker_stop_result | failed
+ retries: 3
+ delay: 30
+
# Set http_proxy, https_proxy, and no_proxy in /etc/atomic.conf
# regexp: the line starts with or without #, followed by the string
@@ -160,9 +165,12 @@
enabled: yes
state: started
daemon_reload: yes
- register: start_result
+ register: r_docker_systemcontainer_docker_start_result
+ until: not r_docker_systemcontainer_docker_start_result | failed
+ retries: 3
+ delay: 30
- set_fact:
- docker_service_status_changed: start_result | changed
+ docker_service_status_changed: "{{ r_docker_systemcontainer_docker_start_result | changed }}"
- meta: flush_handlers
diff --git a/roles/flannel/handlers/main.yml b/roles/flannel/handlers/main.yml
index c60c2115a..02f5a5f64 100644
--- a/roles/flannel/handlers/main.yml
+++ b/roles/flannel/handlers/main.yml
@@ -8,3 +8,7 @@
systemd:
name: "{{ openshift.docker.service_name }}"
state: restarted
+ register: l_docker_restart_docker_in_flannel_result
+ until: not l_docker_restart_docker_in_flannel_result | failed
+ retries: 3
+ delay: 30
diff --git a/roles/openshift_node/handlers/main.yml b/roles/openshift_node/handlers/main.yml
index a6bd12d4e..6b38da7f8 100644
--- a/roles/openshift_node/handlers/main.yml
+++ b/roles/openshift_node/handlers/main.yml
@@ -4,9 +4,14 @@
name: openvswitch
state: restarted
when: (not skip_node_svc_handlers | default(False) | bool) and not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool
+ register: l_openshift_node_stop_openvswitch_result
+ until: not l_openshift_node_stop_openvswitch_result | failed
+ retries: 3
+ delay: 30
notify:
- restart openvswitch pause
+
- name: restart openvswitch pause
pause: seconds=15
when: (not skip_node_svc_handlers | default(False) | bool) and openshift.common.is_containerized | bool
@@ -15,7 +20,13 @@
systemd:
name: "{{ openshift.common.service_type }}-node"
state: restarted
- when: (not skip_node_svc_handlers | default(False) | bool) and not (node_service_status_changed | default(false) | bool)
+ register: l_openshift_node_restart_node_result
+ until: not l_openshift_node_restart_node_result | failed
+ retries: 3
+ delay: 30
+ when:
+ - (not skip_node_svc_handlers | default(False) | bool)
+ - not (node_service_status_changed | default(false) | bool)
- name: reload sysctl.conf
command: /sbin/sysctl -p
diff --git a/roles/openshift_node/tasks/main.yml b/roles/openshift_node/tasks/main.yml
index 573051504..879f6c207 100644
--- a/roles/openshift_node/tasks/main.yml
+++ b/roles/openshift_node/tasks/main.yml
@@ -118,8 +118,12 @@
name: openvswitch.service
enabled: yes
state: started
+ daemon_reload: yes
when: openshift.common.is_containerized | bool and openshift.common.use_openshift_sdn | bool
register: ovs_start_result
+ until: not ovs_start_result | failed
+ retries: 3
+ delay: 30
- set_fact:
ovs_service_status_changed: "{{ ovs_start_result | changed }}"
@@ -212,15 +216,27 @@
state: started
when: openshift.common.is_containerized | bool
+
- name: Start and enable node
systemd:
name: "{{ openshift.common.service_type }}-node"
enabled: yes
state: started
+ daemon_reload: yes
register: node_start_result
until: not node_start_result | failed
retries: 1
delay: 30
+ ignore_errors: true
+
+- name: Dump logs from node service if it failed
+ command: journalctl --no-pager -n 100 {{ openshift.common.service_type }}-node
+ when: node_start_result | failed
+
+- name: Abort if node failed to start
+ fail:
+ msg: Node failed to start please inspect the logs and try again
+ when: node_start_result | failed
- set_fact:
node_service_status_changed: "{{ node_start_result | changed }}"
diff --git a/roles/openshift_node_certificates/handlers/main.yml b/roles/openshift_node_certificates/handlers/main.yml
index 502f80434..4abe8bcaf 100644
--- a/roles/openshift_node_certificates/handlers/main.yml
+++ b/roles/openshift_node_certificates/handlers/main.yml
@@ -9,3 +9,7 @@
name: "{{ openshift.docker.service_name }}"
state: restarted
when: not openshift_certificates_redeploy | default(false) | bool
+ register: l_docker_restart_docker_in_cert_result
+ until: not l_docker_restart_docker_in_cert_result | failed
+ retries: 3
+ delay: 30
diff --git a/roles/openshift_node_upgrade/README.md b/roles/openshift_node_upgrade/README.md
index 8b388cc6a..4e6229bfb 100644
--- a/roles/openshift_node_upgrade/README.md
+++ b/roles/openshift_node_upgrade/README.md
@@ -84,6 +84,11 @@ Including an example of how to use your role (for instance, with variables passe
command: >
{{ hostvars[groups.oo_first_master.0].openshift.common.admin_binary }} drain {{ openshift.node.nodename | lower }} --force --delete-local-data --ignore-daemonsets
delegate_to: "{{ groups.oo_first_master.0 }}"
+ register: l_docker_upgrade_drain_result
+ until: not l_docker_upgrade_drain_result | failed
+ retries: 60
+ delay: 60
+
roles:
- openshift_facts
diff --git a/roles/openshift_node_upgrade/handlers/main.yml b/roles/openshift_node_upgrade/handlers/main.yml
index cb51416d4..110dfe5ce 100644
--- a/roles/openshift_node_upgrade/handlers/main.yml
+++ b/roles/openshift_node_upgrade/handlers/main.yml
@@ -1,7 +1,13 @@
---
- name: restart openvswitch
- systemd: name=openvswitch state=restarted
+ systemd:
+ name: openvswitch
+ state: restarted
when: (not skip_node_svc_handlers | default(False) | bool) and not (ovs_service_status_changed | default(false) | bool) and openshift.common.use_openshift_sdn | bool
+ register: l_openshift_node_upgrade_stop_openvswitch_result
+ until: not l_openshift_node_upgrade_stop_openvswitch_result | failed
+ retries: 3
+ delay: 30
notify:
- restart openvswitch pause
@@ -10,5 +16,13 @@
when: (not skip_node_svc_handlers | default(False) | bool) and openshift.common.is_containerized | bool
- name: restart node
- systemd: name={{ openshift.common.service_type }}-node state=restarted
- when: (not skip_node_svc_handlers | default(False) | bool) and not (node_service_status_changed | default(false) | bool)
+ systemd:
+ name: "{{ openshift.common.service_type }}-node"
+ state: restarted
+ register: l_openshift_node_upgrade_restart_node_result
+ until: not l_openshift_node_upgrade_restart_node_result | failed
+ retries: 3
+ delay: 30
+ when:
+ - (not skip_node_svc_handlers | default(False) | bool)
+ - not (node_service_status_changed | default(false) | bool)
diff --git a/roles/openshift_node_upgrade/tasks/docker/upgrade.yml b/roles/openshift_node_upgrade/tasks/docker/upgrade.yml
index 416cf605a..ebe87d6fd 100644
--- a/roles/openshift_node_upgrade/tasks/docker/upgrade.yml
+++ b/roles/openshift_node_upgrade/tasks/docker/upgrade.yml
@@ -26,7 +26,13 @@
- debug: var=docker_image_count.stdout
when: docker_upgrade_nuke_images is defined and docker_upgrade_nuke_images | bool
-- service: name=docker state=stopped
+- service:
+ name: docker
+ state: stopped
+ register: l_openshift_node_upgrade_docker_stop_result
+ until: not l_openshift_node_upgrade_docker_stop_result | failed
+ retries: 3
+ delay: 30
- name: Upgrade Docker
package: name=docker{{ '-' + docker_version }} state=present
diff --git a/roles/openshift_node_upgrade/tasks/restart.yml b/roles/openshift_node_upgrade/tasks/restart.yml
index 6947223af..f228b6e08 100644
--- a/roles/openshift_node_upgrade/tasks/restart.yml
+++ b/roles/openshift_node_upgrade/tasks/restart.yml
@@ -19,7 +19,7 @@
state: started
register: docker_start_result
until: not docker_start_result | failed
- retries: 1
+ retries: 3
delay: 30
- name: Update docker facts