Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -396,18 +396,14 @@ public class JsonNodeConvertingCodecProvider implements ConvertingCodecProvider
return new JsonNodeToDateRangeCodec(nullStrings);
case DefaultVectorType.VECTOR_CLASS_NAME:
VectorType vectorType = (VectorType) cqlType;
// Step 1: create a JSON codec which will take the input JSON nodes and generate
// something matching the expected data type
ConvertingCodec<JsonNode, ?> jsonCodec =
// Parser for JSON leaf nodes, each of which represents a value of the vector subtype
ConvertingCodec<JsonNode, ?> leafCodec =
createJsonNodeConvertingCodec(vectorType.getElementType(), codecFactory, false);
// Step 2: create a conventional codec which will take instances of the Java type
// generated by the JSON codec above and perform standard serde on them.
ConvertingCodec<?, ?> standardCodec =
codecFactory.createConvertingCodec(
vectorType.getElementType(), jsonCodec.getInternalJavaType(), false);
return new JsonNodeToVectorCodec(
new VectorCodec(vectorType, standardCodec),
jsonCodec,
new VectorCodec(
vectorType,
codecFactory.getCodecRegistry().codecFor(vectorType.getElementType())),
leafCodec,
context.getAttribute(OBJECT_MAPPER),
nullStrings);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,16 +29,16 @@
public class JsonNodeToVectorCodec<SubtypeT extends Number>
extends JsonNodeConvertingCodec<CqlVector<SubtypeT>> {

private final ConvertingCodec<JsonNode, SubtypeT> subtypeCodec;
private final ConvertingCodec<JsonNode, SubtypeT> leafCodec;
private final ObjectMapper objectMapper;

public JsonNodeToVectorCodec(
VectorCodec<SubtypeT> targetCodec,
ConvertingCodec<JsonNode, SubtypeT> subtypeCodec,
ConvertingCodec<JsonNode, SubtypeT> leafCodec,
ObjectMapper objectMapper,
List<String> nullStrings) {
super(targetCodec, nullStrings);
this.subtypeCodec = subtypeCodec;
this.leafCodec = leafCodec;
this.objectMapper = objectMapper;
}

Expand All @@ -47,7 +47,7 @@ public CqlVector<SubtypeT> externalToInternal(JsonNode jsonNode) {
if (jsonNode == null || !jsonNode.isArray()) return null;
List<SubtypeT> elems =
Streams.stream(jsonNode.elements())
.map(e -> subtypeCodec.externalToInternal(e))
.map(e -> leafCodec.externalToInternal(e))
.collect(Collectors.toCollection(ArrayList::new));
return CqlVector.newInstance(elems);
}
Expand All @@ -57,7 +57,7 @@ public JsonNode internalToExternal(CqlVector<SubtypeT> value) {
if (value == null) return null;
ArrayNode root = objectMapper.createArrayNode();
for (SubtypeT element : value) {
root.add(subtypeCodec.internalToExternal(element));
root.add(leafCodec.internalToExternal(element));
}
return root;
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -335,11 +335,15 @@ public class StringConvertingCodecProvider implements ConvertingCodecProvider {
return new StringToDateRangeCodec(nullStrings);
case DefaultVectorType.VECTOR_CLASS_NAME:
VectorType vectorType = (VectorType) cqlType;
return new StringToVectorCodec(
new VectorCodec(
VectorCodec<Number> vectorCodec =
new VectorCodec<>(
vectorType,
codecFactory.getCodecRegistry().codecFor(vectorType.getElementType())),
nullStrings);
codecFactory.getCodecRegistry().codecFor(vectorType.getElementType()));
ConvertingCodec<JsonNode, List<Number>> jsonCodec =
codecFactory.createConvertingCodec(
DataTypes.listOf(vectorType.getElementType()), JSON_NODE_TYPE, false);
return new StringToVectorCodec<>(
vectorCodec, jsonCodec, context.getAttribute(OBJECT_MAPPER), nullStrings);
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See StringToVectorCodec changes below. jsonCodec is here to convert raw string values into Lists; StringToVectorCodec builds CqlVectors out of them.

}
}
// fall through
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,22 +17,57 @@

import com.datastax.oss.driver.api.core.data.CqlVector;
import com.datastax.oss.driver.internal.core.type.codec.VectorCodec;
import com.datastax.oss.dsbulk.codecs.api.ConvertingCodec;
import com.datastax.oss.dsbulk.codecs.text.utils.StringUtils;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import java.io.IOException;
import java.util.List;
import java.util.stream.Collectors;

public class StringToVectorCodec<SubtypeT extends Number>
extends StringConvertingCodec<CqlVector<SubtypeT>> {

public StringToVectorCodec(VectorCodec<SubtypeT> targetCodec, List<String> nullStrings) {
private final ConvertingCodec<JsonNode, List<SubtypeT>> jsonCodec;
private final ObjectMapper objectMapper;

public StringToVectorCodec(
VectorCodec<SubtypeT> targetCodec,
ConvertingCodec<JsonNode, List<SubtypeT>> jsonCodec,
ObjectMapper objectMapper,
List<String> nullStrings) {
super(targetCodec, nullStrings);
this.jsonCodec = jsonCodec;
this.objectMapper = objectMapper;
}

@Override
public CqlVector<SubtypeT> externalToInternal(String s) {
return this.internalCodec.parse(s);
if (isNullOrEmpty(s)) {
return null;
}
try {
JsonNode node = objectMapper.readTree(StringUtils.ensureBrackets(s));
List<SubtypeT> vals = jsonCodec.externalToInternal(node);
return CqlVector.newInstance(vals);
Copy link
Collaborator Author

@absurdfarce absurdfarce Jul 10, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Use JSON codecs to eval input strings as JSON, build a list from that and then build a CqlVector from that list. This makes behaviour of the vector codec consistent with codecs for the collection types by enforcing a common policy around string representations of these types (i.e. they have to be JSON-friendly).

Idea (and implementation) provided by @adutra

} catch (IOException e) {
throw new IllegalArgumentException(String.format("Could not parse '%s' as Json", s), e);
}
}

@Override
public String internalToExternal(CqlVector<SubtypeT> cqlVector) {
return this.internalCodec.format(cqlVector);
if (cqlVector == null) {
return nullString();
}
try {
List<SubtypeT> vals = cqlVector.stream().collect(Collectors.toList());
JsonNode node = jsonCodec.internalToExternal(vals);
return objectMapper.writeValueAsString(node);
} catch (JsonProcessingException e) {
throw new IllegalArgumentException(
String.format("Could not format '%s' to Json", cqlVector), e);
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -104,4 +104,16 @@ void should_encode_too_many_but_not_too_few() {
assertThatThrownBy(() -> dsbulkCodec.encode(tooFewNode, ProtocolVersion.DEFAULT))
.isInstanceOf(IllegalArgumentException.class);
}

/* Issue 484: now that we're using the dsbulk string-to-subtype converters we should get
* enforcement of existing dsbulk policies. For our purposes that means the failure on
* arithmetic overflow */
@Test
void should_not_convert_too_much_precision() {
ArrayNode tooPreciseNode = JSON_NODE_FACTORY.arrayNode();
tooPreciseNode.add(JSON_NODE_FACTORY.numberNode(6.646329843));
assertThat(dsbulkCodec).cannotConvertFromInternal(tooPreciseNode);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm I don't get why you are trying to use dsbulkCodec to convert from an internal type that would be... JsonNode?

The internal type of this codec is CqlVector, so calling cannotConvertFromInternal would make sense only
if you had some instance of CqlVector that is somehow "invalid" – maybe a CqlVector with the wrong number of dimensions, or something like that.

But calling cannotConvertFromInternal(tooPreciseNode) does not make sense to me. It's only possible, btw, because dsbulkCodec is of the raw type JsonNodeToVectorCodec. It should be JsonNodeToVectorCodec<Float> – but in that case, I bet this statement wouldn't compile anymore.

If you are trying to check whether the external type tooPreciseNode causes a runtime error, then call cannotConvertFromExternal(tooPreciseNode) – I fixed something similar for StringToVectorCodecTest, see test should_not_convert_from_invalid_external.

assertThatThrownBy(() -> dsbulkCodec.encode(tooPreciseNode, ProtocolVersion.DEFAULT))
.isInstanceOf(ArithmeticException.class);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -22,26 +22,45 @@
import com.datastax.oss.driver.api.core.data.CqlVector;
import com.datastax.oss.driver.api.core.type.DataTypes;
import com.datastax.oss.driver.api.core.type.codec.TypeCodecs;
import com.datastax.oss.driver.api.core.type.reflect.GenericType;
import com.datastax.oss.driver.internal.core.type.DefaultVectorType;
import com.datastax.oss.driver.internal.core.type.codec.VectorCodec;
import com.datastax.oss.driver.shaded.guava.common.collect.Lists;
import com.datastax.oss.dsbulk.codecs.api.ConversionContext;
import com.datastax.oss.dsbulk.codecs.api.ConvertingCodecFactory;
import com.datastax.oss.dsbulk.codecs.text.TextConversionContext;
import java.util.ArrayList;
import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Test;

public class StringToVectorCodecTest {

private final ArrayList<Float> values = Lists.newArrayList(1.1f, 2.2f, 3.3f, 4.4f, 5.5f);
private final CqlVector vector = CqlVector.newInstance(values);
private final VectorCodec vectorCodec =
new VectorCodec(new DefaultVectorType(DataTypes.FLOAT, 5), TypeCodecs.FLOAT);
private final CqlVector<Float> vector = CqlVector.newInstance(values);
private final VectorCodec<Float> vectorCodec =
new VectorCodec<>(new DefaultVectorType(DataTypes.FLOAT, 5), TypeCodecs.FLOAT);

private final StringToVectorCodec dsbulkCodec =
new StringToVectorCodec(vectorCodec, Lists.newArrayList("NULL"));
private StringToVectorCodec<Float> codec;

@BeforeEach
void setUp() {
ConversionContext context = new TextConversionContext().setNullStrings("NULL");
ConvertingCodecFactory codecFactory = new ConvertingCodecFactory(context);
codec =
(StringToVectorCodec<Float>)
codecFactory.<String, CqlVector<Float>>createConvertingCodec(
DataTypes.vectorOf(DataTypes.FLOAT, 5), GenericType.STRING, true);
}
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a much cleaner way to get to a workable ConversionContext! 👍


@Test
void should_convert_from_valid_external() {
assertThat(dsbulkCodec)
.convertsFromExternal(vectorCodec.format(vector)) // standard pattern
assertThat(codec)
.convertsFromExternal(
vectorCodec.format(vector)) // CQL representation is parsable as a json array
.toInternal(vector)
.convertsFromExternal("[1.1,2.2,3.3,4.4,5.5]")
.toInternal(vector)
.convertsFromExternal("[1.1000,2.2000,3.3000,4.4000,5.5000]")
.toInternal(vector)
.convertsFromExternal("")
.toInternal(null)
Expand All @@ -53,39 +72,46 @@ void should_convert_from_valid_external() {

@Test
void should_convert_from_valid_internal() {
assertThat(dsbulkCodec)
assertThat(codec)
.convertsFromInternal(vector)
.toExternal(vectorCodec.format(vector))
.toExternal(
"[1.1,2.2,3.3,4.4,5.5]") // this is NOT 100% identical to vector CQL representation
.convertsFromInternal(null)
.toExternal("NULL");

// We should encode
}

@Test
void should_not_convert_from_invalid_internal() {
assertThat(dsbulkCodec).cannotConvertFromInternal("not a valid vector");
void should_not_convert_from_invalid_external() {
assertThat(codec).cannotConvertFromExternal("[6.646329843]");
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This effectively winds up duplicating should_not_convert_too_much_precision() in a way that isn't very clear. The original intent of this method was to perform something similar to JsonNodeToVectorCodecTest.should_not_convert_from_invalid_internal(), specifically given something that isn't a CqlVector this method fails completely. We could certainly add a few more cases but I'd argue it's worthwhile to preserve the symmetry.

}

// To keep usage consistent with VectorCodec we confirm that we support encoding when too many
// elements are
// available but not when too few are. Note that it's actually VectorCodec that enforces this
// constraint so we
// have to go through encode() rather than the internal/external methods.
// elements are available but not when too few are. Note that it's actually VectorCodec that
// enforces this constraint so we have to go through encode() rather than the internal/external
// methods.
@Test
void should_encode_too_many_but_not_too_few() {

ArrayList<Float> tooMany = Lists.newArrayList(values);
tooMany.add(6.6f);
CqlVector<Float> tooManyVector = CqlVector.newInstance(tooMany);
String tooManyString = dsbulkCodec.internalToExternal(tooManyVector);
String tooManyString = codec.internalToExternal(tooManyVector);
ArrayList<Float> tooFew = Lists.newArrayList(values);
tooFew.remove(0);
CqlVector<Float> tooFewVector = CqlVector.newInstance(tooFew);
String tooFewString = dsbulkCodec.internalToExternal(tooFewVector);
String tooFewString = codec.internalToExternal(tooFewVector);

assertThat(dsbulkCodec.encode(tooManyString, ProtocolVersion.DEFAULT)).isNotNull();
assertThatThrownBy(() -> dsbulkCodec.encode(tooFewString, ProtocolVersion.DEFAULT))
assertThat(codec.encode(tooManyString, ProtocolVersion.DEFAULT)).isNotNull();
assertThatThrownBy(() -> codec.encode(tooFewString, ProtocolVersion.DEFAULT))
.isInstanceOf(IllegalArgumentException.class);
}

// Issue 484: now that we're using the dsbulk string-to-subtype converters we should get
// enforcement of existing dsbulk policies. For our purposes that means the failure on
// arithmetic overflow.
@Test
void should_not_convert_too_much_precision() {
assertThatThrownBy(() -> codec.encode("6.646329843", ProtocolVersion.DEFAULT))
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'd argue this is incorrect. This test is intended to mirror the equivalent test in JsonNodeToVectorCodecTest. In that case we're trying to confirm that the JSON representation for an otherwise valid vector (really just a JSON array in that case) fails to convert because precision policies in the dsbulk codecs are being enforced. If we want to model the same thing here this should be the string representation for an otherwise valid vector... which means it should be something like:

  // Issue 484: now that we're using the dsbulk string-to-subtype converters we should get
  // enforcement of existing dsbulk policies.  For our purposes that means the failure on
  // arithmetic overflow.
  @Test
  void should_not_convert_too_much_precision() {
    assertThatThrownBy(() -> codec.encode("[1.1, 2.2, 3.3, 6.646329843]", ProtocolVersion.DEFAULT))
        .isInstanceOf(ArithmeticException.class);
  }

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Your example is fine, but the current one is too. In DSBulk, enclosing brackets and braces are generally optional. So codec.encode("6.646329843", ProtocolVersion.DEFAULT) should behave like codec.encode("[6.646329843]", ProtocolVersion.DEFAULT). You can add both tests btw.

But note that this test is almost identical to should_not_convert_from_invalid_external. Maybe merge both into one single test?

.isInstanceOf(ArithmeticException.class);
}
}