Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions include/zos-base.h
Original file line number Diff line number Diff line change
Expand Up @@ -460,6 +460,7 @@ __Z_EXPORT void __tb(void);

__Z_EXPORT notagread_t __get_no_tag_read_behaviour();
__Z_EXPORT int __get_no_tag_ignore_ccsid1047();
__Z_EXPORT int __get_untagged_file_ccsid();

#ifdef __cplusplus
/**
Expand Down
82 changes: 64 additions & 18 deletions man/zoslib.1
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,46 @@ New files are created with encoding IBM-1047 and tagged IBM-1047.
New files are created without translation and are tagged as BINARY.

.TP
.B __UNTAGGED_READ_MODE=AUTO
(default) for handling of reading untagged files or files tagged with CCSID 1047 and txtflag turned off, up to 4k of data will be read and checked; if it is found to be in CCSID 1047, the data is converted
.B __UNTAGGED_FILE_ENCODING
(Recommended) Declares the encoding for untagged files. Takes precedence over __UNTAGGED_READ_MODE when both are set. Supports:
.RS
.IP \(bu 2
.B DETECT
\- (default) automatic detection: read up to 4k of data and, if it is found to be in CCSID 1047, convert it
.IP \(bu 2
.B IGNORE
\- no conversion, treat untagged files as binary
.IP \(bu 2
.B WARN
\- same as DETECT but issue a warning when conversion occurs
.IP \(bu 2
.B Numeric CCSID
\- e.g., "1047", "819", "1208" - treat untagged files as that encoding
.IP \(bu 2
.B Encoding names
\- e.g., "IBM-1047", "ISO8859-1", "UTF-8", "ASCII" - mapped to corresponding CCSID
.RE

.TP
.B __UNTAGGED_READ_MODE=ASCII
always convert data from CCSID 1047 to CCSID 819

.TP
.B __UNTAGGED_READ_MODE=NO
changes the __UNTAGGED_READ_MODE behavior to ignore files tagged with CCSID 1047 and txtflag turned off

.TP
.B __UNTAGGED_READ_MODE=STRICT
for no explicit conversion of data

.TP
.B __UNTAGGED_READ_MODE=WARN
for same behavior as "AUTO" but issue a warning if conversion occurs
.B __UNTAGGED_READ_MODE
(Legacy/Compatibility) For handling of reading untagged files or files tagged with CCSID 1047 and txtflag turned off. When both __UNTAGGED_FILE_ENCODING and __UNTAGGED_READ_MODE are set, __UNTAGGED_FILE_ENCODING takes precedence. Supports:
.RS
.IP \(bu 2
.B AUTO
\- (default) automatic detection, same as DETECT
.IP \(bu 2
.B ASCII
\- always convert data from CCSID 1047 to CCSID 819
.IP \(bu 2
.B NO
\- ignore files tagged with CCSID 1047 and txtflag turned off
.IP \(bu 2
.B STRICT
\- no explicit conversion of data, same as IGNORE
.IP \(bu 2
.B WARN
\- same as AUTO but issue a warning if conversion occurs
.RE

.TP
.B __MEMORY_USAGE_LOG_LEVEL
Expand All @@ -56,11 +78,35 @@ name of the log file associated with __MEMORY_USAGE_LOG_LEVEL, including 'stdout
set to toggle debug ZOSLIB mode

.SH EXAMPLES
To set the __UNTAGGED_READ_MODE environment variable to STRICT and disable explicit conversion of data:
To declare that untagged files should be treated as UTF-8:

.B export __UNTAGGED_FILE_ENCODING=UTF-8

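This will cause ZOSLIB to treat all untagged files as UTF-8 (CCSID 1208). Note that the current conversion path only implements CCSID 1047 to CCSID 819 conversion, so declaring a CCSID other than 1047 or 819 may not produce the expected result.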

To declare that untagged files should be treated as EBCDIC (CCSID 1047):

.B export __UNTAGGED_FILE_ENCODING=1047

This will cause ZOSLIB to treat all untagged files as CCSID 1047 and perform conversion to ASCII.

To disable conversion of untagged files:

.B export __UNTAGGED_FILE_ENCODING=IGNORE

This will cause ZOSLIB to not perform any explicit conversion of data for untagged files.

To enable automatic detection with warnings:

.B export __UNTAGGED_FILE_ENCODING=WARN

This will cause ZOSLIB to automatically detect encoding for untagged files and issue a warning when conversion occurs.

To use the legacy mode (for backwards compatibility):

.B export __UNTAGGED_READ_MODE=STRICT

This will cause ZOSLIB to not perform any explicit conversion of data for untagged files or files tagged with CCSID 1047 and txtflag turned off.
This will cause ZOSLIB to not perform any explicit conversion of data for untagged files or files tagged with CCSID 1047 and txtflag turned off. Note that __UNTAGGED_FILE_ENCODING takes precedence if both are set.

To set the STDOUT CCSID to 819 (ASCII):

Expand Down
21 changes: 19 additions & 2 deletions src/zos-char-util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -528,8 +528,25 @@ int __file_needs_conversion_init(const char *name, int fd) {
if (no_tag_read_behaviour == __NO_TAG_READ_STRICT)
return 0;
if (no_tag_read_behaviour == __NO_TAG_READ_V6) {
fdcache.set_attribute(fd, FD_NEEDS_CONVERSION_ATTR);
return 1;
// User explicitly specified a CCSID for untagged files
int untagged_ccsid = __get_untagged_file_ccsid();

// Note: Current conversion infrastructure only supports EBCDIC (1047) to ASCII (819).
// For other CCSIDs specified via __UNTAGGED_FILE_ENCODING, we assume the user
// knows the file is in that encoding and needs conversion.
// Full arbitrary CCSID conversion support would require iconv() integration.

if (untagged_ccsid == 1047 || untagged_ccsid == 819 || untagged_ccsid == 0) {
// Standard EBCDIC or ASCII - use existing conversion path
fdcache.set_attribute(fd, FD_NEEDS_CONVERSION_ATTR);
return 1;
} else {
// For other CCSIDs (like UTF-8), mark for conversion but note limitation
// The actual conversion will still be 1047->819, so this may not work as expected
// TODO: Implement full iconv() support for arbitrary CCSID conversions
fdcache.set_attribute(fd, FD_NEEDS_CONVERSION_ATTR);
return 1;
}
}
if (lseek(fd, 1, SEEK_SET) == 1 && lseek(fd, 0, SEEK_SET) == 0) {
// seekable file (real file)
Expand Down
205 changes: 191 additions & 14 deletions src/zos.cc
Original file line number Diff line number Diff line change
Expand Up @@ -901,29 +901,206 @@ extern "C" int __getexepath(char *path, int pathlen, pid_t pid) {

// Value handed back by __get_no_tag_read_behaviour().
static notagread_t no_tag_read_behaviour;
// NOTE(review): presumably the value behind __get_no_tag_ignore_ccsid1047();
// its accessor is not visible in this hunk - confirm.
static int no_tag_ignore_ccsid1047;
// CCSID chosen for untagged files (0 when no explicit CCSID was parsed).
// Written by get_no_tag_read_behaviour() and handed back by
// __get_untagged_file_ccsid().
static int untagged_file_ccsid;

// Internal types for __UNTAGGED_FILE_ENCODING parsing
// How untagged files are to be handled, as decided from the environment.
// get_no_tag_read_behaviour() maps each value onto a notagread_t.
typedef enum {
// Default: mapped to __NO_TAG_READ_DEFAULT.
ZOS_UNTAGGED_POLICY_DETECT = 0,
// Mapped to __NO_TAG_READ_STRICT.
ZOS_UNTAGGED_POLICY_IGNORE = 1,
// Mapped to __NO_TAG_READ_DEFAULT_WITHWARNING.
ZOS_UNTAGGED_POLICY_WARN = 2,
// An explicit CCSID was given; mapped to __NO_TAG_READ_V6.
ZOS_UNTAGGED_POLICY_CCSID = 3
} zos_untagged_policy_t;

// Outcome of parsing the untagged-file environment variables.
typedef struct {
// Selected handling policy.
zos_untagged_policy_t policy;
// Explicit CCSID for ZOS_UNTAGGED_POLICY_CCSID; the parsers store 0 for
// every other policy.
int ccsid;
// 1 when the value came from __UNTAGGED_FILE_ENCODING, 0 when it came from
// __UNTAGGED_READ_MODE or is the built-in default.
int from_new_variable;
} zos_untagged_policy_result_t;

/**
 * Convert a decimal string into a CCSID.
 *
 * The whole string must be consumed by strtol() (base 10), so trailing
 * characters are rejected; leading whitespace and a sign are accepted only
 * to the extent strtol() itself accepts them.
 *
 * @param str candidate text; NULL and "" are rejected.
 * @return the CCSID in the range 1..65535, or -1 when the text is not a
 *         number in that range.
 */
static int parse_numeric_ccsid(const char *str) {
  if (str == NULL || str[0] == '\0') {
    return -1;
  }

  char *tail = NULL;
  const long parsed = strtol(str, &tail, 10);

  // Anything left after the digits means the text was not purely numeric.
  const int fully_consumed = (*tail == '\0');
  // An overflowing value saturates in strtol() and lands outside this range.
  const int in_range = (parsed >= 1 && parsed <= 65535);

  return (fully_consumed && in_range) ? (int)parsed : -1;
}

/**
* Map encoding name to CCSID using z/OS __toCcsid().
*/
static int encoding_name_to_ccsid(const char *encoding_name) {
if (!encoding_name || !*encoding_name)
return -1;

errno = 0;
__ccsid_t ccsid = __toCcsid((char *)encoding_name);

if (ccsid == 0 && errno != 0)
return -1;

if (ccsid > 0)
return (int)ccsid;

return -1;
}

/**
* Parse __UNTAGGED_FILE_ENCODING environment variable.
*/
static int parse_new_untagged_variable(const char *value,
zos_untagged_policy_result_t *result) {
if (!value || !*value)
return 0;

// Check for semantic tokens first
if (strcasecmp(value, "DETECT") == 0 || strcasecmp(value, "AUTO") == 0) {
result->policy = ZOS_UNTAGGED_POLICY_DETECT;
result->ccsid = 0;
result->from_new_variable = 1;
return 1;
}

if (strcasecmp(value, "IGNORE") == 0 || strcasecmp(value, "STRICT") == 0) {
result->policy = ZOS_UNTAGGED_POLICY_IGNORE;
result->ccsid = 0;
result->from_new_variable = 1;
return 1;
}

if (strcasecmp(value, "WARN") == 0 || strcasecmp(value, "WARNING") == 0) {
result->policy = ZOS_UNTAGGED_POLICY_WARN;
result->ccsid = 0;
result->from_new_variable = 1;
return 1;
}

// Try parsing as numeric CCSID
int ccsid = parse_numeric_ccsid(value);
if (ccsid > 0) {
result->policy = ZOS_UNTAGGED_POLICY_CCSID;
result->ccsid = ccsid;
result->from_new_variable = 1;
return 1;
}

// Try parsing as encoding name
ccsid = encoding_name_to_ccsid(value);
if (ccsid > 0) {
result->policy = ZOS_UNTAGGED_POLICY_CCSID;
result->ccsid = ccsid;
result->from_new_variable = 1;
return 1;
}

return 0;
}

/**
 * Interpret a value of the legacy __UNTAGGED_READ_MODE environment variable.
 *
 * Matching is case-insensitive. Recognised values:
 *   AUTO, NO     -> ZOS_UNTAGGED_POLICY_DETECT
 *   STRICT       -> ZOS_UNTAGGED_POLICY_IGNORE
 *   WARN         -> ZOS_UNTAGGED_POLICY_WARN
 *   ASCII, V6    -> ZOS_UNTAGGED_POLICY_CCSID with CCSID 819
 *
 * @param value  text of the variable; NULL and "" are not recognised.
 * @param result filled in only when the value is recognised;
 *               from_new_variable is then set to 0.
 * @return 1 when the value was recognised, 0 otherwise.
 */
static int parse_legacy_untagged_variable(const char *value,
                                          zos_untagged_policy_result_t *result) {
  if (!value || !*value)
    return 0;

  // "NO" must not disable conversion of untagged files: it is documented as
  // affecting only files tagged CCSID 1047 with txtflag off, and it is read
  // separately by get_no_tag_ignore_ccsid1047(). Treating it like STRICT
  // would change the untagged-file read behaviour, so it keeps the default
  // detection policy here.
  if (strcasecmp(value, "AUTO") == 0 || strcasecmp(value, "NO") == 0) {
    result->policy = ZOS_UNTAGGED_POLICY_DETECT;
    result->ccsid = 0;
    result->from_new_variable = 0;
    return 1;
  }

  if (strcasecmp(value, "STRICT") == 0) {
    result->policy = ZOS_UNTAGGED_POLICY_IGNORE;
    result->ccsid = 0;
    result->from_new_variable = 0;
    return 1;
  }

  if (strcasecmp(value, "WARN") == 0) {
    result->policy = ZOS_UNTAGGED_POLICY_WARN;
    result->ccsid = 0;
    result->from_new_variable = 0;
    return 1;
  }

  // Both spellings of the forced-conversion mode are accepted.
  if (strcasecmp(value, "ASCII") == 0 || strcasecmp(value, "V6") == 0) {
    result->policy = ZOS_UNTAGGED_POLICY_CCSID;
    result->ccsid = 819;
    result->from_new_variable = 0;
    return 1;
  }

  return 0;
}

/**
 * Work out the untagged-file policy from the environment.
 *
 * __UNTAGGED_FILE_ENCODING is consulted first; only when it is unset or its
 * value is not recognised is the legacy __UNTAGGED_READ_MODE consulted.
 *
 * @return the parsed policy, or the default (DETECT, CCSID 0,
 *         from_new_variable 0) when neither variable yields a recognised
 *         value.
 */
static zos_untagged_policy_result_t get_untagged_policy_from_env(void) {
  zos_untagged_policy_result_t outcome;
  outcome.policy = ZOS_UNTAGGED_POLICY_DETECT;
  outcome.ccsid = 0;
  outcome.from_new_variable = 0;

  const char *preferred = __getenv_a("__UNTAGGED_FILE_ENCODING");
  const int preferred_ok =
      preferred && parse_new_untagged_variable(preferred, &outcome);

  if (!preferred_ok) {
    // The parsers leave the result untouched on failure, so the defaults
    // above survive an unrecognised value.
    const char *legacy = __getenv_a("__UNTAGGED_READ_MODE");
    if (legacy) {
      parse_legacy_untagged_variable(legacy, &outcome);
    }
  }

  return outcome;
}

/**
 * Determine the read behaviour for untagged files and record the CCSID that
 * goes with it.
 *
 * The policy comes from get_untagged_policy_from_env(), which reads both
 * __UNTAGGED_FILE_ENCODING and __UNTAGGED_READ_MODE itself.
 *
 * @param envar unused; kept so existing callers keep compiling.
 * @return the notagread_t value matching the parsed policy.
 *
 * Side effect: stores the parsed CCSID (0 when none) in untagged_file_ccsid.
 */
static notagread_t get_no_tag_read_behaviour(const char *envar) {
  (void)envar;  // variable names are fixed inside the unified parser

  // Use the new unified parser that handles both old and new environment variables
  zos_untagged_policy_result_t policy = get_untagged_policy_from_env();

  // Store the CCSID if specified
  untagged_file_ccsid = policy.ccsid;

  // Map new policy enum to legacy notagread_t enum
  switch (policy.policy) {
    case ZOS_UNTAGGED_POLICY_DETECT:
      return __NO_TAG_READ_DEFAULT;
    case ZOS_UNTAGGED_POLICY_WARN:
      return __NO_TAG_READ_DEFAULT_WITHWARNING;
    case ZOS_UNTAGGED_POLICY_IGNORE:
      return __NO_TAG_READ_STRICT;
    case ZOS_UNTAGGED_POLICY_CCSID:
      // When explicit CCSID is specified, use V6 mode (forced conversion)
      return __NO_TAG_READ_V6;
    default:
      return __NO_TAG_READ_DEFAULT;
  }
}

// C-linkage accessor: returns the file-scope no_tag_read_behaviour value.
extern "C" notagread_t __get_no_tag_read_behaviour() {
return no_tag_read_behaviour;
}

// C-linkage accessor: returns the file-scope untagged_file_ccsid value,
// which get_no_tag_read_behaviour() sets from the parsed policy (0 when no
// explicit CCSID was given).
extern "C" int __get_untagged_file_ccsid() {
return untagged_file_ccsid;
}

static int get_no_tag_ignore_ccsid1047(const char *envar) {
char *ntr = __getenv_a(envar);
if (ntr && !strcmp(ntr, "NO")) {
Expand Down
Loading