
Commit 9b7ab21

ueshin authored and dongjoon-hyun committed
[SPARK-53975][PYTHON] Adds basic Python worker logging support

### What changes were proposed in this pull request?

Adds basic Python worker logging support. Logs from Python's standard logger, or from `print` to `stdout` and `stderr`, will be available in the `system.session.python_worker_logs` view.

* `spark.sql.pyspark.worker.logging.enabled` (`False` by default)

  When set to true, this configuration enables comprehensive logging within Python worker processes that execute User-Defined Functions (UDFs), User-Defined Table Functions (UDTFs), and other Python-based operations in Spark SQL.

For example:

```py
>>> from pyspark.sql.functions import *
>>> import logging
>>> import sys
>>>
>>> @udf
... def logging_test_udf(x):
...     logger = logging.getLogger("test")
...     logger.setLevel(logging.INFO)
...     logger.info(f"INFO level message: {x}")
...     print(f"PRINT(STDOUT): {x}")  # INFO level, logger is "stdout"
...     print(f"PRINT(STDERR): {x}", file=sys.stderr)  # ERROR level, logger is "stderr"
...     try:
...         1 / x
...     except:
...         logger.exception(f"1 / {x}")
...     return str(x)
...
>>> spark.conf.set("spark.sql.pyspark.worker.logging.enabled", True)
>>>
>>> spark.range(2).select(logging_test_udf("id")).show()
+-----+
|f(id)|
+-----+
|    0|
|    1|
+-----+

>>> spark.table("system.session.python_worker_logs").orderBy("ts").show(truncate=False)
+--------------------------+-----+---------------------+-------------------------------+-----------------------------------------------------------------------------+------+
|ts                        |level|msg                  |context                        |exception                                                                    |logger|
+--------------------------+-----+---------------------+-------------------------------+-----------------------------------------------------------------------------+------+
|2025-10-21 17:22:01.862654|INFO |INFO level message: 0|{func_name -> logging_test_udf}|NULL                                                                         |test  |
|2025-10-21 17:22:01.863826|INFO |INFO level message: 1|{func_name -> logging_test_udf}|NULL                                                                         |test  |
|2025-10-21 17:22:01.86505 |INFO |PRINT(STDOUT): 0     |{func_name -> logging_test_udf}|NULL                                                                         |stdout|
|2025-10-21 17:22:01.865827|INFO |PRINT(STDOUT): 1     |{func_name -> logging_test_udf}|NULL                                                                         |stdout|
|2025-10-21 17:22:01.87052 |ERROR|PRINT(STDERR): 0     |{func_name -> logging_test_udf}|NULL                                                                         |stderr|
|2025-10-21 17:22:01.871405|ERROR|PRINT(STDERR): 1     |{func_name -> logging_test_udf}|NULL                                                                         |stderr|
|2025-10-21 17:22:01.87188 |ERROR|1 / 0                |{func_name -> logging_test_udf}|{ZeroDivisionError, division by zero, [{NULL, logging_test_udf, <stdin>, 8}]}|test  |
+--------------------------+-----+---------------------+-------------------------------+-----------------------------------------------------------------------------+------+
```

### Why are the changes needed?

Logs emitted from Python UDFs are difficult to collect because they go to each executor's `stderr` file. With many executors, the `stderr` files have to be checked one by one.

### Does this PR introduce _any_ user-facing change?

Yes, Python UDF logging is available, and the logs can be collected via a system view.

### How was this patch tested?

Added the related tests.

### Was this patch authored or co-authored using generative AI tooling?

No.

Closes #52689 from ueshin/issues/SPARK-53975/basic_logging.

Authored-by: Takuya Ueshin <[email protected]>
Signed-off-by: Dongjoon Hyun <[email protected]>
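As a quick follow-on to the example above, the view can be filtered like any other table. The sketch below is illustrative only: it assumes the column names shown in the output above (`ts`, `level`, `msg`, `context`, `exception`, `logger`), an active SparkSession named `spark`, and that `context` is a map column, as its `{key -> value}` rendering suggests.

```py
# Illustrative query against the system.session.python_worker_logs view;
# assumes spark.sql.pyspark.worker.logging.enabled was set for this session.
from pyspark.sql import functions as F

logs = spark.table("system.session.python_worker_logs")

(logs
    .where(F.col("level") == "ERROR")                            # only ERROR entries
    .where(F.col("context")["func_name"] == "logging_test_udf")  # one UDF (map lookup)
    .orderBy(F.col("ts").desc())                                 # newest first
    .select("ts", "logger", "msg", "exception")
    .show(truncate=False))
```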
1 parent 6892875 commit 9b7ab21

File tree

24 files changed: +1009 −27 lines

core/src/main/scala/org/apache/spark/api/python/PythonWorkerFactory.scala

Lines changed: 6 additions & 1 deletion

```diff
@@ -395,12 +395,16 @@ private[spark] class PythonWorkerFactory(
     }
   }
 
+  private val workerLogCapture =
+    envVars.get("PYSPARK_SPARK_SESSION_UUID").map(new PythonWorkerLogCapture(_))
+
   /**
   * Redirect the given streams to our stderr in separate threads.
   */
  private def redirectStreamsToStderr(stdout: InputStream, stderr: InputStream): Unit = {
    try {
-      new RedirectThread(stdout, System.err, "stdout reader for " + pythonExec).start()
+      new RedirectThread(workerLogCapture.map(_.wrapInputStream(stdout)).getOrElse(stdout),
+        System.err, "stdout reader for " + pythonExec).start()
      new RedirectThread(stderr, System.err, "stderr reader for " + pythonExec).start()
    } catch {
      case e: Exception =>
@@ -460,6 +464,7 @@ private[spark] class PythonWorkerFactory(
  }
 
  def stop(): Unit = {
+    workerLogCapture.foreach(_.closeAllWriters())
    stopDaemon()
  }
 
```
Lines changed: 193 additions & 0 deletions (new file, `PythonWorkerLogCapture.scala`)

```scala
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.spark.api.python

import java.io.{BufferedReader, InputStream, InputStreamReader}
import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets
import java.util.concurrent.ConcurrentHashMap
import java.util.concurrent.atomic.AtomicLong

import scala.jdk.CollectionConverters._

import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging
import org.apache.spark.storage.{PythonWorkerLogBlockIdGenerator, PythonWorkerLogLine, RollingLogWriter}

/**
 * Manages Python UDF log capture and routing to per-worker log writers.
 *
 * This class handles the parsing of Python worker output streams and routes
 * log messages to appropriate rolling log writers based on worker PIDs.
 * Works for both daemon and non-daemon modes.
 */
private[python] class PythonWorkerLogCapture(
    sessionId: String,
    logMarker: String = "PYTHON_WORKER_LOGGING") extends Logging {

  // Map to track per-worker log writers: workerId(PID) -> (writer, sequenceId)
  private val workerLogWriters = new ConcurrentHashMap[String, (RollingLogWriter, AtomicLong)]()

  /**
   * Creates an InputStream wrapper that captures Python UDF logs from the given stream.
   *
   * @param inputStream The input stream to wrap (typically daemon stdout or worker stdout)
   * @return A wrapped InputStream that captures and routes log messages
   */
  def wrapInputStream(inputStream: InputStream): InputStream = {
    new CaptureWorkerLogsInputStream(inputStream)
  }

  /**
   * Removes and closes the log writer for a specific worker.
   *
   * @param workerId The worker ID (typically PID as string)
   */
  def removeAndCloseWorkerLogWriter(workerId: String): Unit = {
    Option(workerLogWriters.remove(workerId)).foreach { case (writer, _) =>
      try {
        writer.close()
      } catch {
        case e: Exception =>
          logWarning(s"Failed to close log writer for worker $workerId", e)
      }
    }
  }

  /**
   * Closes all active worker log writers.
   */
  def closeAllWriters(): Unit = {
    workerLogWriters.values().asScala.foreach { case (writer, _) =>
      try {
        writer.close()
      } catch {
        case e: Exception =>
          logWarning("Failed to close log writer", e)
      }
    }
    workerLogWriters.clear()
  }

  /**
   * Gets or creates a log writer for the specified worker.
   *
   * @param workerId Unique identifier for the worker (typically PID)
   * @return Tuple of (RollingLogWriter, AtomicLong sequence counter)
   */
  private def getOrCreateLogWriter(workerId: String): (RollingLogWriter, AtomicLong) = {
    workerLogWriters.computeIfAbsent(workerId, _ => {
      val logWriter = SparkEnv.get.blockManager.getRollingLogWriter(
        new PythonWorkerLogBlockIdGenerator(sessionId, workerId)
      )
      (logWriter, new AtomicLong())
    })
  }

  /**
   * Processes a log line from a Python worker.
   *
   * @param line The complete line containing the log marker and JSON
   * @return The prefix (non-log content) that should be passed through
   */
  private def processLogLine(line: String): String = {
    val markerIndex = line.indexOf(s"$logMarker:")
    if (markerIndex >= 0) {
      val prefix = line.substring(0, markerIndex)
      val markerAndJson = line.substring(markerIndex)

      // Parse: "PYTHON_WORKER_LOGGING:12345:{json}"
      val parts = markerAndJson.split(":", 3)
      if (parts.length >= 3) {
        val workerId = parts(1) // This is the PID from Python worker
        val json = parts(2)

        try {
          if (json.isEmpty) {
            removeAndCloseWorkerLogWriter(workerId)
          } else {
            val (writer, seqId) = getOrCreateLogWriter(workerId)
            writer.writeLog(
              PythonWorkerLogLine(System.currentTimeMillis(), seqId.getAndIncrement(), json)
            )
          }
        } catch {
          case e: Exception =>
            logWarning(s"Failed to write log for worker $workerId", e)
        }
      }
      prefix
    } else {
      line + System.lineSeparator()
    }
  }

  /**
   * InputStream wrapper that captures and processes Python UDF logs.
   */
  private class CaptureWorkerLogsInputStream(in: InputStream) extends InputStream {

    private[this] val reader = new BufferedReader(
      new InputStreamReader(in, StandardCharsets.ISO_8859_1))
    private[this] val temp = new Array[Byte](1)
    private[this] var buffer = ByteBuffer.allocate(0)

    override def read(): Int = {
      val n = read(temp)
      if (n <= 0) {
        -1
      } else {
        // Signed byte to unsigned integer
        temp(0) & 0xff
      }
    }

    override def read(b: Array[Byte], off: Int, len: Int): Int = {
      if (buffer.hasRemaining) {
        val buf = ByteBuffer.wrap(b, off, len)
        val remaining = Math.min(buffer.remaining(), buf.remaining())
        buf.put(buf.position(), buffer, buffer.position(), remaining)
        buffer.position(buffer.position() + remaining)
        remaining
      } else {
        val line = reader.readLine()
        if (line == null) {
          closeAllWriters()
          -1
        } else {
          val processedContent = if (line.contains(s"$logMarker:")) {
            processLogLine(line)
          } else {
            line + System.lineSeparator()
          }

          buffer = ByteBuffer.wrap(processedContent.getBytes(StandardCharsets.ISO_8859_1))
          read(b, off, len)
        }
      }
    }

    override def close(): Unit = {
      try {
        reader.close()
      } finally {
        closeAllWriters()
      }
    }
  }
}
```
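For orientation, the sketch below imitates the stdout line format that `processLogLine` parses (`<logMarker>:<workerId>:<json>`, with an empty JSON payload signalling that the worker's writer should be closed). It is not the actual PySpark worker-side emitter, and the payload fields are assumptions loosely based on the view columns shown in the commit message.

```py
# Hypothetical illustration of the line format consumed by
# PythonWorkerLogCapture; NOT the real PySpark worker emitter.
import json
import os
import sys

MARKER = "PYTHON_WORKER_LOGGING"  # matches the Scala-side default logMarker


def emit_worker_log(level, msg, logger="stdout"):
    # Assumed payload shape; the real record schema is defined by the PySpark worker.
    payload = {"level": level, "msg": msg, "logger": logger}
    # One line per record: "<marker>:<pid>:<json>", written to the worker's stdout.
    print(f"{MARKER}:{os.getpid()}:{json.dumps(payload)}", file=sys.stdout, flush=True)


emit_worker_log("INFO", "hello from a worker")

# On the Scala side, split(":", 3) yields [marker, workerId, json]; anything
# before the marker on the same line is passed through to the executor's stderr.
```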

core/src/main/scala/org/apache/spark/storage/BlockId.scala

Lines changed: 33 additions & 5 deletions

```diff
@@ -179,6 +179,7 @@ case class PythonStreamBlockId(streamId: Int, uniqueId: Long) extends BlockId {
 object LogBlockType extends Enumeration {
   type LogBlockType = Value
   val TEST = Value
+  val PYTHON_WORKER = Value
 }
 
 /**
@@ -188,30 +189,54 @@ object LogBlockType extends Enumeration {
  * and log management.
  * @param executorId the ID of the executor that produced this log block.
  */
-abstract sealed class LogBlockId(
-    val lastLogTime: Long,
-    val executorId: String) extends BlockId {
+abstract sealed class LogBlockId extends BlockId {
+  def lastLogTime: Long
+  def executorId: String
   def logBlockType: LogBlockType
 }
 
 object LogBlockId {
   def empty(logBlockType: LogBlockType): LogBlockId = {
     logBlockType match {
       case LogBlockType.TEST => TestLogBlockId(0L, "")
+      case LogBlockType.PYTHON_WORKER => PythonWorkerLogBlockId(0L, "", "", "")
       case _ => throw new SparkException(s"Unsupported log block type: $logBlockType")
     }
   }
 }
 
 // Used for test purpose only.
-case class TestLogBlockId(override val lastLogTime: Long, override val executorId: String)
-  extends LogBlockId(lastLogTime, executorId) {
+case class TestLogBlockId(lastLogTime: Long, executorId: String)
+  extends LogBlockId {
   override def name: String =
     "test_log_" + lastLogTime + "_" + executorId
 
   override def logBlockType: LogBlockType = LogBlockType.TEST
 }
 
+/**
+ * Identifies a block of Python worker log data.
+ *
+ * @param lastLogTime the timestamp of the last log entry in this block, used for filtering
+ * and log management.
+ * @param executorId the ID of the executor that produced this log block.
+ * @param sessionId the session ID to isolate the logs.
+ * @param workerId the worker ID to distinguish the Python worker process.
+ */
+@DeveloperApi
+case class PythonWorkerLogBlockId(
+    lastLogTime: Long,
+    executorId: String,
+    sessionId: String,
+    workerId: String)
+  extends LogBlockId {
+  override def name: String = {
+    s"python_worker_log_${lastLogTime}_${executorId}_${sessionId}_$workerId"
+  }
+
+  override def logBlockType: LogBlockType = LogBlockType.PYTHON_WORKER
+}
+
 /** Id associated with temporary local data managed as blocks. Not serializable. */
 private[spark] case class TempLocalBlockId(id: UUID) extends BlockId {
   override def name: String = "temp_local_" + id
@@ -260,6 +285,7 @@ object BlockId {
   val TEMP_SHUFFLE = "temp_shuffle_([-A-Fa-f0-9]+)".r
   val TEST = "test_(.*)".r
   val TEST_LOG_BLOCK = "test_log_([0-9]+)_(.*)".r
+  val PYTHON_WORKER_LOG_BLOCK = "python_worker_log_([0-9]+)_([^_]*)_([^_]*)_([^_]*)".r
 
   def apply(name: String): BlockId = name match {
     case RDD(rddId, splitIndex) =>
@@ -302,6 +328,8 @@ object BlockId {
       TempShuffleBlockId(UUID.fromString(uuid))
     case TEST_LOG_BLOCK(lastLogTime, executorId) =>
       TestLogBlockId(lastLogTime.toLong, executorId)
+    case PYTHON_WORKER_LOG_BLOCK(lastLogTime, executorId, sessionId, workerId) =>
+      PythonWorkerLogBlockId(lastLogTime.toLong, executorId, sessionId, workerId)
     case TEST(value) =>
       TestBlockId(value)
     case _ => throw SparkCoreErrors.unrecognizedBlockIdError(name)
```
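As a small aside, the new block-name format and the `PYTHON_WORKER_LOG_BLOCK` regex round-trip cleanly, but only if the executor, session, and worker IDs contain no underscores (each is captured by `[^_]*`). The sketch below just exercises that regex against a made-up block name.

```py
# Demonstrates the python_worker_log_* naming scheme from BlockId.scala;
# the sample values below are made up for illustration.
import re

PYTHON_WORKER_LOG_BLOCK = re.compile(r"python_worker_log_([0-9]+)_([^_]*)_([^_]*)_([^_]*)")

# name format: python_worker_log_<lastLogTime>_<executorId>_<sessionId>_<workerId>
name = "python_worker_log_1730000000000_0_f0e0d0c0_12345"

last_log_time, executor_id, session_id, worker_id = (
    PYTHON_WORKER_LOG_BLOCK.fullmatch(name).groups())
print(last_log_time, executor_id, session_id, worker_id)
# 1730000000000 0 f0e0d0c0 12345
```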

core/src/main/scala/org/apache/spark/storage/LogBlockIdGenerator.scala

Lines changed: 11 additions & 0 deletions

```diff
@@ -47,3 +47,14 @@ trait LogBlockIdGenerator {
     blockId
   }
 }
+
+class PythonWorkerLogBlockIdGenerator(
+    sessionId: String,
+    workerId: String)
+  extends LogBlockIdGenerator {
+
+  override def logBlockType: LogBlockType = LogBlockType.PYTHON_WORKER
+
+  override protected def genUniqueBlockId(lastLogTime: Long, executorId: String): LogBlockId =
+    PythonWorkerLogBlockId(lastLogTime, executorId, sessionId, workerId)
+}
```

core/src/main/scala/org/apache/spark/storage/LogLine.scala

Lines changed: 5 additions & 0 deletions

```diff
@@ -38,6 +38,8 @@ object LogLine {
     logBlockType match {
       case LogBlockType.TEST =>
         classTag[TestLogLine]
+      case LogBlockType.PYTHON_WORKER =>
+        classTag[PythonWorkerLogLine]
       case unsupportedLogBlockType =>
         throw new RuntimeException("Not supported log type " + unsupportedLogBlockType)
     }
@@ -46,3 +48,6 @@
 case class TestLogLine(eventTime: Long, sequenceId: Long, message: String)
   extends LogLine {
 }
+
+case class PythonWorkerLogLine(eventTime: Long, sequenceId: Long, message: String)
+  extends LogLine
```
