Advanced Container Orchestration for Data Engineering Pipelines: Production-Ready Kubernetes, Helm, and Service Mesh Strategies
Building upon the fundamental performance tuning concepts covered in our previous blog post on Performance Tuning in PySpark, this article turns to the infrastructure layer where those tuned pipelines actually run.
Building upon our foundation in PySpark with Docker, this guide explores advanced container orchestration techniques essential for production data engineering environments. We’ll cover Kubernetes deployments, service mesh architectures, and automated scaling strategies for data-intensive applications.
# Dockerfile.spark-app
# Multi-stage build: compile the application JAR with Maven in a throwaway
# builder stage, then copy only the artifacts into a slim Spark runtime image
# so build tooling never ships to production.

# Stage 1: Build environment
FROM maven:3.8-openjdk-11-slim AS builder

WORKDIR /app
# Copy the POM alone first so the dependency layer stays cached until pom.xml changes
COPY pom.xml .
RUN mvn dependency:go-offline -B

COPY src ./src
RUN mvn clean package -DskipTests

# Stage 2: Runtime environment
FROM apache/spark:3.5.0-scala2.12-java11-python3-ubuntu

USER root

# Install production dependencies in a single layer; purge apt lists to keep the image small
RUN apt-get update && apt-get install -y \
    curl \
    wget \
    procps \
    && rm -rf /var/lib/apt/lists/*

# Copy application JAR and its runtime dependency JARs from the build stage
COPY --from=builder /app/target/spark-app-1.0.jar /opt/spark-apps/
COPY --from=builder /app/target/lib /opt/spark-apps/lib/

# Python dependencies for PySpark
COPY requirements.txt /tmp/
RUN pip install --no-cache-dir -r /tmp/requirements.txt

# Application scripts and configuration
COPY scripts/ /opt/spark-apps/scripts/
COPY conf/ /opt/spark-apps/conf/

# Health check script
COPY healthcheck.py /opt/spark-apps/
RUN chmod +x /opt/spark-apps/scripts/*.sh

# Set working directory
WORKDIR /opt/spark-apps

# Container-level health check; generous start period allows for JVM warm-up
HEALTHCHECK --interval=30s --timeout=10s --start-period=60s --retries=3 \
    CMD python healthcheck.py

# 4040: Spark application UI, 7077: master RPC, 8080/8081: master/worker web UIs
EXPOSE 4040 7077 8080 8081

ENTRYPOINT ["/opt/spark/bin/spark-submit"]
---
# spark-namespace.yaml
# Dedicated namespace isolating all data-engineering workloads.
apiVersion: v1
kind: Namespace
metadata:
  name: data-engineering
  labels:
    name: data-engineering

---
# spark-service-account.yaml
# Identity the Spark pods run under; bound below to the operator ClusterRole.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: spark-operator
  namespace: data-engineering

---
# ClusterRole granting the permissions Spark-on-Kubernetes needs to manage
# executor pods, services, and configmaps at runtime.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: spark-operator-role
rules:
- apiGroups: [""]
  resources: ["pods", "services", "configmaps"]
  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]
- apiGroups: ["apps"]
  resources: ["deployments"]
  verbs: ["create", "get", "list", "watch", "update", "patch", "delete"]

---
# Binds the ClusterRole to the spark-operator ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: spark-operator-binding
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: spark-operator-role
subjects:
- kind: ServiceAccount
  name: spark-operator
  namespace: data-engineering

---
# spark-configmap.yaml
# Shared Spark and logging configuration, mounted into /opt/spark/conf
# by the master and worker Deployments below.
apiVersion: v1
kind: ConfigMap
metadata:
  name: spark-config
  namespace: data-engineering
data:
  spark-defaults.conf: |
    spark.sql.adaptive.enabled=true
    spark.sql.adaptive.coalescePartitions.enabled=true
    spark.serializer=org.apache.spark.serializer.KryoSerializer
    spark.kubernetes.executor.deleteOnTermination=true
    spark.kubernetes.executor.podNamePrefix=spark-exec
    spark.kubernetes.container.image.pullPolicy=Always
    spark.executor.memory=2g
    spark.executor.cores=2
    spark.driver.memory=1g
    spark.dynamicAllocation.enabled=true
    spark.dynamicAllocation.minExecutors=1
    spark.dynamicAllocation.maxExecutors=10
  log4j.properties: |
    log4j.rootLogger=INFO, console
    log4j.appender.console=org.apache.log4j.ConsoleAppender
    log4j.appender.console.target=System.out
    log4j.appender.console.layout=org.apache.log4j.PatternLayout
    log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n

---
# spark-deployment.yaml
# Single-replica Spark master. Exposes the cluster RPC port (7077)
# and the master web UI (8080).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: spark-master
  namespace: data-engineering
  labels:
    app: spark-master
spec:
  replicas: 1
  selector:
    matchLabels:
      app: spark-master
  template:
    metadata:
      labels:
        app: spark-master
    spec:
      serviceAccountName: spark-operator
      containers:
      - name: spark-master
        image: spark-data-app:latest
        imagePullPolicy: Always
        ports:
        - containerPort: 7077
          name: spark-master
        - containerPort: 8080
          name: web-ui
        env:
        - name: SPARK_MODE
          value: "master"
        - name: SPARK_MASTER_HOST
          value: "0.0.0.0"
        - name: SPARK_MASTER_PORT
          value: "7077"
        - name: SPARK_MASTER_WEBUI_PORT
          value: "8080"
        volumeMounts:
        - name: spark-config
          mountPath: /opt/spark/conf
        resources:
          requests:
            memory: "1Gi"
            cpu: "500m"
          limits:
            memory: "2Gi"
            cpu: "1"
        livenessProbe:
          httpGet:
            path: /
            port: 8080
          initialDelaySeconds: 60
          periodSeconds: 30
        readinessProbe:
          httpGet:
            path: /
            port: 8080
          initialDelaySeconds: 30
          periodSeconds: 10
      volumes:
      - name: spark-config
        configMap:
          name: spark-config

---
# spark-worker-deployment.yaml
# Worker pool; each worker registers with the master via the
# spark-master-service DNS name defined below.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: spark-worker
  namespace: data-engineering
  labels:
    app: spark-worker
spec:
  replicas: 3
  selector:
    matchLabels:
      app: spark-worker
  template:
    metadata:
      labels:
        app: spark-worker
    spec:
      serviceAccountName: spark-operator
      containers:
      - name: spark-worker
        image: spark-data-app:latest
        imagePullPolicy: Always
        ports:
        - containerPort: 8081
          name: web-ui
        env:
        - name: SPARK_MODE
          value: "worker"
        - name: SPARK_MASTER_URL
          value: "spark://spark-master-service:7077"
        - name: SPARK_WORKER_MEMORY
          value: "2g"
        - name: SPARK_WORKER_CORES
          value: "2"
        volumeMounts:
        - name: spark-config
          mountPath: /opt/spark/conf
        resources:
          requests:
            memory: "2Gi"
            cpu: "1"
          limits:
            memory: "4Gi"
            cpu: "2"
        livenessProbe:
          httpGet:
            path: /
            port: 8081
          initialDelaySeconds: 60
          periodSeconds: 30
        # Readiness probe added for parity with the master Deployment so the
        # headless worker Service only lists workers whose UI is responding.
        readinessProbe:
          httpGet:
            path: /
            port: 8081
          initialDelaySeconds: 30
          periodSeconds: 10
      volumes:
      - name: spark-config
        configMap:
          name: spark-config

---
# spark-services.yaml
# LoadBalancer service exposing the master RPC port and web UI externally.
apiVersion: v1
kind: Service
metadata:
  name: spark-master-service
  namespace: data-engineering
  labels:
    app: spark-master
spec:
  selector:
    app: spark-master
  ports:
  - port: 7077
    targetPort: 7077
    name: spark-master
  - port: 8080
    targetPort: 8080
    name: web-ui
  type: LoadBalancer

---
apiVersion: v1
kind: Service
metadata:
  name: spark-worker-service
  namespace: data-engineering
  labels:
    app: spark-worker
spec:
  selector:
    app: spark-worker
  ports:
  - port: 8081
    targetPort: 8081
    name: web-ui
  clusterIP: None  # Headless service for worker discovery
---
# Chart.yaml
# Helm chart metadata. The kafka and redis subcharts are pulled from the
# Bitnami repository and toggled via the matching *.enabled values.
apiVersion: v2
name: data-engineering-platform
description: A Helm chart for data engineering applications
version: 1.0.0
appVersion: "3.5.0"

dependencies:
- name: kafka
  version: "0.21.0"
  repository: "https://charts.bitnami.com/bitnami"
  condition: kafka.enabled
- name: redis
  version: "17.0.0"
  repository: "https://charts.bitnami.com/bitnami"
  condition: redis.enabled

---
# values.yaml
# Default values; override per environment with -f or --set.
replicaCount: 3

image:
  repository: spark-data-app
  pullPolicy: Always
  tag: "latest"

service:
  type: LoadBalancer
  port: 80

ingress:
  enabled: true
  className: "nginx"
  annotations:
    cert-manager.io/cluster-issuer: "letsencrypt-prod"
    nginx.ingress.kubernetes.io/ssl-redirect: "true"
  hosts:
  - host: data-platform.example.com
    paths:
    - path: /
      pathType: Prefix
  tls:
  - secretName: data-platform-tls
    hosts:
    - data-platform.example.com

# When autoscaling is enabled, the HPA owns the replica count and the
# Deployment template below omits .spec.replicas.
autoscaling:
  enabled: true
  minReplicas: 3
  maxReplicas: 20
  targetCPUUtilizationPercentage: 70
  targetMemoryUtilizationPercentage: 80

resources:
  limits:
    cpu: 2000m
    memory: 4Gi
  requests:
    cpu: 1000m
    memory: 2Gi

kafka:
  enabled: true
  replicaCount: 3
  auth:
    enabled: false

redis:
  enabled: true
  auth:
    enabled: false
  master:
    resources:
      limits:
        cpu: 250m
        memory: 256Mi

monitoring:
  enabled: true
  serviceMonitor:
    enabled: true
    interval: 30s

---
# templates/deployment.yaml
# Main application Deployment. The checksum/config annotation forces a
# rolling restart whenever the rendered ConfigMap changes.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: {{ include "data-engineering-platform.fullname" . }}
  labels:
    {{- include "data-engineering-platform.labels" . | nindent 4 }}
spec:
  {{- if not .Values.autoscaling.enabled }}
  replicas: {{ .Values.replicaCount }}
  {{- end }}
  selector:
    matchLabels:
      {{- include "data-engineering-platform.selectorLabels" . | nindent 6 }}
  template:
    metadata:
      annotations:
        checksum/config: {{ include (print $.Template.BasePath "/configmap.yaml") . | sha256sum }}
      labels:
        {{- include "data-engineering-platform.selectorLabels" . | nindent 8 }}
    spec:
      containers:
      - name: {{ .Chart.Name }}
        image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}"
        imagePullPolicy: {{ .Values.image.pullPolicy }}
        ports:
        - name: http
          containerPort: 8080
          protocol: TCP
        - name: spark-ui
          containerPort: 4040
          protocol: TCP
        env:
        # Subchart service DNS names follow the Bitnami <release>-<chart> convention.
        - name: KAFKA_BOOTSTRAP_SERVERS
          value: "{{ include "data-engineering-platform.fullname" . }}-kafka:9092"
        - name: REDIS_URL
          value: "{{ include "data-engineering-platform.fullname" . }}-redis-master:6379"
        livenessProbe:
          httpGet:
            path: /health
            port: http
          initialDelaySeconds: 120
          periodSeconds: 30
        readinessProbe:
          httpGet:
            path: /ready
            port: http
          initialDelaySeconds: 60
          periodSeconds: 10
        resources:
          {{- toYaml .Values.resources | nindent 12 }}
        volumeMounts:
        - name: config
          mountPath: /opt/spark/conf
      volumes:
      - name: config
        configMap:
          name: {{ include "data-engineering-platform.fullname" . }}-config
---
# istio-gateway.yaml
# TLS-terminating ingress gateway for the data platform host.
# NOTE: networking.istio.io/v1beta1 replaces the deprecated v1alpha3 API.
apiVersion: networking.istio.io/v1beta1
kind: Gateway
metadata:
  name: data-platform-gateway
  namespace: data-engineering
spec:
  selector:
    istio: ingressgateway
  servers:
  - port:
      number: 443
      name: https
      protocol: HTTPS
    tls:
      mode: SIMPLE
      credentialName: data-platform-cred
    hosts:
    - data-platform.example.com

---
# istio-virtual-service.yaml
# Routes API traffic to the Spark master service and the Spark UI path
# to the driver UI port. The fault block injects a 5s delay into 0.1%
# of API requests for resilience testing.
apiVersion: networking.istio.io/v1beta1
kind: VirtualService
metadata:
  name: data-platform-vs
  namespace: data-engineering
spec:
  hosts:
  - data-platform.example.com
  gateways:
  - data-platform-gateway
  http:
  - match:
    - uri:
        prefix: /api/v1/
    route:
    - destination:
        host: spark-master-service
        port:
          number: 8080
    fault:
      delay:
        percentage:
          value: 0.1
        fixedDelay: 5s
    retries:
      attempts: 3
      perTryTimeout: 30s
  - match:
    - uri:
        prefix: /spark-ui/
    route:
    - destination:
        host: spark-master-service
        port:
          number: 4040
    timeout: 300s

---
# istio-destination-rule.yaml
# Connection pooling plus outlier detection (Istio's circuit breaker):
# hosts returning 3 consecutive 5xx responses are ejected for 30s,
# capped at 50% of the pool.
# NOTE: "circuitBreaker" is not a valid trafficPolicy field — outlier
# detection is configured via "outlierDetection".
apiVersion: networking.istio.io/v1beta1
kind: DestinationRule
metadata:
  name: spark-services-dr
  namespace: data-engineering
spec:
  host: spark-master-service
  trafficPolicy:
    connectionPool:
      tcp:
        maxConnections: 100
      http:
        http1MaxPendingRequests: 50
        maxRequestsPerConnection: 10
    loadBalancer:
      simple: LEAST_CONN
    outlierDetection:
      consecutive5xxErrors: 3
      interval: 30s
      baseEjectionTime: 30s
      maxEjectionPercent: 50
This container orchestration guide provides production-ready patterns for deploying data engineering applications at scale. The Kubernetes, Helm, and Istio configurations shown enable robust, auto-scaling data pipelines with proper service mesh integration.
For foundational Docker concepts with PySpark, see our PySpark Docker tutorial. For advanced deployment strategies, explore our related guides on Apache Spark performance optimization.