Skip to content

Commit 289698a

Browse files
committed
Merge branch 'develop' into v4.0.0-rc-dev
2 parents b6cd4cd + 66a4b30 commit 289698a

File tree

3 files changed

+240
-7
lines changed

3 files changed

+240
-7
lines changed

synapseutils/sync.py

Lines changed: 24 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,8 @@
6060
ARRAY_BRACKET_PATTERN = re.compile(r"^\[.*\]$")
6161
SINGLE_OPEN_BRACKET_PATTERN = re.compile(r"^\[")
6262
SINGLE_CLOSING_BRACKET_PATTERN = re.compile(r"\]$")
63+
# https://stackoverflow.com/questions/18893390/splitting-on-comma-outside-quotes
64+
COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN = re.compile(r",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)")
6365

6466
tracer = trace.get_tracer("synapseclient")
6567

@@ -968,6 +970,13 @@ def _write_manifest_data(
968970
Write a number of keys and a list of data to a manifest file. This will write
969971
the data out as a tab separated file.
970972
973+
For the data we are writing to the TSV file we are not quoting the content with any
974+
characters. This is because the syncToSynapse function does not require strings to
975+
be quoted. When quote characters were included extra double quotes were being added
976+
to the strings when they were written to the manifest file. This was not causing
977+
errors, however, it was changing the content of the manifest file when changes
978+
were not required.
979+
971980
Args:
972981
filename: The name of the file to write to.
973982
keys: The keys of the manifest.
@@ -976,7 +985,13 @@ def _write_manifest_data(
976985
"""
977986
with io.open(filename, "w", encoding="utf8") if filename else sys.stdout as fp:
978987
csv_writer = csv.DictWriter(
979-
fp, keys, restval="", extrasaction="ignore", delimiter="\t"
988+
fp,
989+
keys,
990+
restval="",
991+
extrasaction="ignore",
992+
delimiter="\t",
993+
quotechar=None,
994+
quoting=csv.QUOTE_NONE,
980995
)
981996
csv_writer.writeheader()
982997
for row in data:
@@ -1215,21 +1230,23 @@ def syncToSynapse(
12151230

12161231
def _split_string(input_string: str) -> typing.List[str]:
12171232
"""
1218-
Use StringIO to create a file-like object for the csv.reader.
1233+
Use regex to split a string apart by commas that are not inside of double quotes.
12191234
1220-
Extract the first row (there should only be one row)
12211235
12221236
Args:
12231237
input_string: A string to split apart.
12241238
12251239
Returns:
12261240
The list of split items as strings.
12271241
"""
1228-
csv_reader = csv.reader(io.StringIO(input_string))
1242+
row = COMMAS_OUTSIDE_DOUBLE_QUOTES_PATTERN.split(input_string)
12291243

1230-
row = next(csv_reader)
1244+
modified_row = []
1245+
for item in row:
1246+
modified_item = item.strip()
1247+
modified_row.append(modified_item)
12311248

1232-
return row
1249+
return modified_row
12331250

12341251

12351252
def _convert_cell_in_manifest_to_python_types(
@@ -1247,6 +1264,7 @@ def _convert_cell_in_manifest_to_python_types(
12471264
all that is present.
12481265
"""
12491266
values_to_return = []
1267+
cell = cell.strip()
12501268

12511269
if ARRAY_BRACKET_PATTERN.match(string=cell):
12521270
# Replace the first '[' with an empty string

tests/integration/synapseutils/test_synapseutils_sync.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def __init__(self):
3737

3838
self.header = "path parent used executed activityName synapseStore foo date_1 datetime_1 datetime_2 datetime_3 multiple_strings multiple_dates multiple_bools multiple_ints multiple_floats annotation_with_escaped_commas\n"
3939
self.row1 = (
40-
'%s %s %s "%s;https://www.example.com" provName bar 2020-01-01 2023-12-04T07:00:00Z 2023-12-05 23:37:02.995+00:00 2023-12-05 07:00:00+00:00 [a,b,c,d] [2020-01-01,2023-12-04T07:00:00.111Z,2023-12-05 23:37:02.333+00:00,2023-12-05 07:00:00+00:00] [fAlSe,False,tRuE,True] [1,2,3,4] [1.2,3.4,5.6,7.8] ["my, string with a comma","another, string with a comma"]\n'
40+
'%s %s %s "%s;https://www.example.com" provName bar 2020-01-01 2023-12-04T07:00:00Z 2023-12-05 23:37:02.995+00:00 2023-12-05 07:00:00+00:00 [a, b,c, d] [2020-01-01,2023-12-04T07:00:00.111Z, 2023-12-05 23:37:02.333+00:00,2023-12-05 07:00:00+00:00] [fAlSe,False, tRuE,True ] [1,2,3,4] [1.2,3.4,5.6, 7.8] ["my, string with a comma", "another, string with a comma" ]\n'
4141
% (
4242
self.f1,
4343
self.project.id,

tests/unit/synapseutils/unit_test_synapseutils_sync.py

Lines changed: 215 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import random
99
import tempfile
1010
import threading
11+
import math
1112

1213
import pytest
1314
from unittest.mock import ANY, patch, create_autospec, Mock, call, MagicMock
@@ -1415,3 +1416,217 @@ def test_generate_sync_manifest(syn):
14151416
patch_write_manifest_data.assert_called_with(
14161417
manifest_path, ["path", "parent"], patch_walk_directory_tree.return_value
14171418
)
1419+
1420+
1421+
class TestConvertCellInManifestToPythonTypes(object):
1422+
"""
1423+
Test the _convert_cell_in_manifest_to_python_types function for each type
1424+
and single/multiple values.
1425+
"""
1426+
1427+
def test_datetime_single_item(self) -> None:
1428+
# GIVEN a single item datetime string
1429+
datetime_string = "2020-01-01T00:00:00.000Z"
1430+
datetime_string_in_brackets = "[2020-01-01T00:00:00.000Z]"
1431+
1432+
# WHEN _convert_cell_in_manifest_to_python_types is called
1433+
# THEN I expect the output to be a datetime object
1434+
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
1435+
datetime_string
1436+
) == datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
1437+
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
1438+
datetime_string_in_brackets
1439+
) == datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc)
1440+
1441+
def test_datetime_multiple_items(self) -> None:
1442+
# GIVEN a multiple item datetime string
1443+
datetime_string = "[2020-01-01T00:00:00.000Z, 2020-01-02T00:00:00.000Z]"
1444+
1445+
# WHEN _convert_cell_in_manifest_to_python_types is called
1446+
# THEN I expect the output to be a list of datetime objects
1447+
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
1448+
datetime_string
1449+
) == [
1450+
datetime.datetime(2020, 1, 1, 0, 0, 0, 0, tzinfo=datetime.timezone.utc),
1451+
datetime.datetime(2020, 1, 2, 0, 0, 0, 0, tzinfo=datetime.timezone.utc),
1452+
]
1453+
1454+
def test_bool_single_item(self) -> None:
1455+
# GIVEN a single item bool string
1456+
bool_string = "TrUe"
1457+
bool_string_in_brackets = "[tRue]"
1458+
1459+
# WHEN _convert_cell_in_manifest_to_python_types is called
1460+
# THEN I expect the output to be a bool object
1461+
assert (
1462+
synapseutils.sync._convert_cell_in_manifest_to_python_types(bool_string)
1463+
is True
1464+
)
1465+
assert (
1466+
synapseutils.sync._convert_cell_in_manifest_to_python_types(
1467+
bool_string_in_brackets
1468+
)
1469+
is True
1470+
)
1471+
1472+
def test_bool_multiple_items(self) -> None:
1473+
# GIVEN a multiple item bool string
1474+
bool_string = "[TrUe, fAlse]"
1475+
1476+
# WHEN _convert_cell_in_manifest_to_python_types is called
1477+
# THEN I expect the output to be a list of bool objects
1478+
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
1479+
bool_string
1480+
) == [True, False]
1481+
1482+
def test_int_single_item(self) -> None:
1483+
# GIVEN a single item int string
1484+
int_string = "123"
1485+
int_string_in_brackets = "[123]"
1486+
1487+
# WHEN _convert_cell_in_manifest_to_python_types is called
1488+
# THEN I expect the output to be a int object
1489+
assert (
1490+
synapseutils.sync._convert_cell_in_manifest_to_python_types(int_string)
1491+
== 123
1492+
)
1493+
assert (
1494+
synapseutils.sync._convert_cell_in_manifest_to_python_types(
1495+
int_string_in_brackets
1496+
)
1497+
== 123
1498+
)
1499+
1500+
def test_int_multiple_items(self) -> None:
1501+
# GIVEN a multiple item int string
1502+
int_string = "[123, 456]"
1503+
1504+
# WHEN _convert_cell_in_manifest_to_python_types is called
1505+
# THEN I expect the output to be a list of int objects
1506+
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(
1507+
int_string
1508+
) == [123, 456]
1509+
1510+
def test_float_single_item(self) -> None:
1511+
# GIVEN a single item float string
1512+
float_string = "123.456"
1513+
float_string_in_brackets = "[123.456]"
1514+
1515+
# WHEN _convert_cell_in_manifest_to_python_types is called
1516+
# THEN I expect the output to be a float object
1517+
assert math.isclose(
1518+
synapseutils.sync._convert_cell_in_manifest_to_python_types(float_string),
1519+
123.456,
1520+
)
1521+
assert math.isclose(
1522+
synapseutils.sync._convert_cell_in_manifest_to_python_types(
1523+
float_string_in_brackets
1524+
),
1525+
123.456,
1526+
)
1527+
1528+
def test_float_multiple_items(self) -> None:
1529+
# GIVEN a multiple item float string
1530+
float_string = " [123.456, 789.012]"
1531+
1532+
# WHEN _convert_cell_in_manifest_to_python_types is called
1533+
# THEN I expect the output to be a list of float objects
1534+
result = synapseutils.sync._convert_cell_in_manifest_to_python_types(
1535+
float_string
1536+
)
1537+
assert math.isclose(result[0], 123.456)
1538+
assert math.isclose(result[1], 789.012)
1539+
1540+
def test_string_single_item(self) -> None:
1541+
# GIVEN a single item string
1542+
string = " foo"
1543+
string_in_brackets = " [foo]"
1544+
1545+
# WHEN _convert_cell_in_manifest_to_python_types is called
1546+
# THEN I expect the output to be a string object
1547+
assert (
1548+
synapseutils.sync._convert_cell_in_manifest_to_python_types(string) == "foo"
1549+
)
1550+
assert (
1551+
synapseutils.sync._convert_cell_in_manifest_to_python_types(
1552+
string_in_brackets
1553+
)
1554+
== "foo"
1555+
)
1556+
1557+
def test_string_multiple_items(self) -> None:
1558+
# GIVEN a multiple item string
1559+
string = " [foo, bar]"
1560+
1561+
# WHEN _convert_cell_in_manifest_to_python_types is called
1562+
# THEN I expect the output to be a list of string objects
1563+
assert synapseutils.sync._convert_cell_in_manifest_to_python_types(string) == [
1564+
"foo",
1565+
"bar",
1566+
]
1567+
1568+
def test_string_single_item_with_comma(self) -> None:
1569+
# GIVEN a single item string
1570+
string = "my, sentence, with, commas"
1571+
1572+
# WHEN _convert_cell_in_manifest_to_python_types is called
1573+
# THEN I expect the output to be a string object
1574+
assert (
1575+
synapseutils.sync._convert_cell_in_manifest_to_python_types(string)
1576+
== string
1577+
)
1578+
1579+
1580+
class TestSplitString(object):
1581+
"""
1582+
Test the _split_string function. Note as a pre-check to calling this function
1583+
the string would have started with brackets `[]`, but they were removed before
1584+
calling this function.
1585+
"""
1586+
1587+
def test_single_item(self) -> None:
1588+
# GIVEN single item strings
1589+
string_with_no_quotes = "foo"
1590+
string_with_quotes = '"foo"'
1591+
string_with_quotes_inside_string = 'foo "bar" baz'
1592+
string_with_commas_inside_string = '"foo, bar, baz"'
1593+
1594+
# WHEN split_strings is called
1595+
1596+
# THEN I expect the output to treat all values as a single item
1597+
assert synapseutils.sync._split_string(string_with_no_quotes) == [
1598+
string_with_no_quotes
1599+
]
1600+
assert synapseutils.sync._split_string(string_with_quotes) == [
1601+
string_with_quotes
1602+
]
1603+
assert synapseutils.sync._split_string(string_with_quotes_inside_string) == [
1604+
string_with_quotes_inside_string
1605+
]
1606+
assert synapseutils.sync._split_string(string_with_commas_inside_string) == [
1607+
string_with_commas_inside_string
1608+
]
1609+
1610+
def test_multiple_item(self) -> None:
1611+
# GIVEN multiple item strings
1612+
string_with_no_quotes = "foo, bar, baz"
1613+
string_with_quotes = '"foo", "bar", "baz"'
1614+
string_with_commas_in_some_items = '"foo, bar", baz, "foo, bar, baz"'
1615+
1616+
# WHEN split_strings is called
1617+
# THEN I expect the output to split the string into multiple items
1618+
assert synapseutils.sync._split_string(string_with_no_quotes) == [
1619+
"foo",
1620+
"bar",
1621+
"baz",
1622+
]
1623+
assert synapseutils.sync._split_string(string_with_quotes) == [
1624+
'"foo"',
1625+
'"bar"',
1626+
'"baz"',
1627+
]
1628+
assert synapseutils.sync._split_string(string_with_commas_in_some_items) == [
1629+
'"foo, bar"',
1630+
"baz",
1631+
'"foo, bar, baz"',
1632+
]

0 commit comments

Comments
 (0)