Logo Questions Linux Laravel Mysql Ubuntu Git Menu
 

How to split a large JSON file based on an array property which is deeply nested?

I have a large json file (around 16Gb) with the following structure:

{
  "Job": {
    "Keys": {
      "JobID": "test123",
      "DeviceID": "TEST01"
    },
    "Props": {
      "FileType": "Measurements",
      "InstrumentDescriptions": [
        {
          "InstrumentID": "1723007",
          "InstrumentType": "Actual1",
          "Name": "U",
          "DataType": "Double",
          "Units": "degC"
        },
        {
          "InstrumentID": "2424009",
          "InstrumentType": "Actual2",
          "Name": "VG03",
          "DataType": "Double",
          "Units": "Pa"
        }
      ]
    },
    "Steps": [
      {
        "Keys": {
          "StepID": "START",
          "StepResult": "NormalEnd"
        },
        "InstrumentData": [
          {
            "Keys": {
              "InstrumentID": "1723007"
            },
            "Measurements": [
              {
                "DateTime": "2021-11-16 21:18:37.000",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.100",
                "Value": 539
              },
              {
                "DateTime": "2021-11-16 21:18:37.200",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.300",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.400",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.500",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.600",
                "Value": 540
              },
              {
                "DateTime": "2021-11-16 21:18:37.700",
                "Value": 538
              },
              {
                "DateTime": "2021-11-16 21:18:37.800",
                "Value": 540
              }
            ]
          },
          {
            "Keys": {
              "InstrumentID": "2424009"
            },
            "Measurements": [
              {
                "DateTime": "2021-11-16 21:18:37.000",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.100",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.200",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.300",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.400",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.500",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.600",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.700",
                "Value": 1333.22
              },
              {
                "DateTime": "2021-11-16 21:18:37.800",
                "Value": 1333.22
              }
            ]
          }
        ]
      }
    ]
  }
}

The problem

I would like to split this file into multiple files by splitting the array "InstrumentData" because this array will be holding the major chunk of the data. Splitting this file into smaller files would enable me to parse the file without getting an out of memory exception.

Current State

/// <summary>
/// Splits a large JSON file into a small "template" (everything except one big array)
/// plus one output file per element of the array, where each output file is the
/// template with the array placeholder replaced by a single element.
/// NOTE(review): this only handles an array property at the ROOT of the document;
/// nested paths such as Job.Steps[*].InstrumentData are not reachable from here.
/// </summary>
/// <param name="filename">Path of the large JSON file to split.</param>
/// <param name="arrayPropertyName">Name of the root-level array property to split on.</param>
public static void SplitJson(string filename, string arrayPropertyName)
{
    string templateFileName = @"C:\Temp\template.json";
    string arrayFileName = @"C:\Temp\array.json";

    CreateEmptyFile(templateFileName);
    CreateEmptyFile(arrayFileName);

    // Pass 1: stream the source once. Everything except the big array is copied
    // into the template file; the big array itself is diverted verbatim into its
    // own temporary file so it never has to be held in memory.
    using (Stream stream = File.OpenRead(filename))
    using (JsonReader reader = new JsonTextReader(new StreamReader(stream)))
    using (JsonWriter templateWriter = new JsonTextWriter(new StreamWriter(templateFileName)))
    using (JsonWriter arrayWriter = new JsonTextWriter(new StreamWriter(arrayFileName)))
    {
        if (reader.Read() && reader.TokenType == JsonToken.StartObject)
        {
            templateWriter.WriteStartObject();
            while (reader.Read() && reader.TokenType != JsonToken.EndObject)
            {
                string propertyName = (string)reader.Value;
                reader.Read(); // advance from the property name to its value
                templateWriter.WritePropertyName(propertyName);
                if (propertyName == arrayPropertyName)
                {
                    // Divert the huge array into its own file and leave an empty
                    // object placeholder in the template, overwritten in pass 2.
                    arrayWriter.WriteToken(reader);
                    templateWriter.WriteStartObject();  // empty placeholder object
                    templateWriter.WriteEndObject();
                }
                else if (reader.TokenType == JsonToken.StartObject ||
                         reader.TokenType == JsonToken.StartArray)
                {
                    // WriteToken() copies the entire subtree and leaves the reader
                    // positioned on the subtree's end token.
                    templateWriter.WriteToken(reader);
                }
                else
                {
                    templateWriter.WriteValue(reader.Value);
                }
            }
            templateWriter.WriteEndObject();
        }
    }

    // Pass 2: stream the array file and emit one output file per array element,
    // each one combined with the (small) template.
    JObject template = JObject.Parse(File.ReadAllText(templateFileName));
    using (JsonReader arrayReader = new JsonTextReader(new StreamReader(arrayFileName)))
    {
        int counter = 0;
        while (arrayReader.Read())
        {
            if (arrayReader.TokenType == JsonToken.StartObject)
            {
                counter++;
                JObject item = JObject.Load(arrayReader);
                template[arrayPropertyName] = item;
                // FIX: the original built the name from template["name"] and
                // template["age"], properties that do not exist in this document
                // (leftovers from a different sample), yielding names like
                // "output___1.json". Use the running counter instead.
                string fileName = string.Format(@"C:\Temp\output_{0}.json", counter);

                File.WriteAllText(fileName, template.ToString());
            }
        }
    }

    // Clean up temporary files
    File.Delete(templateFileName);
    File.Delete(arrayFileName);
}

I am using this method to try and split the file into smaller files. However, this method can only split the files based on properties which are in the root level.

The question

Am I on the right track to tackle this problem? Is this an efficient way to tackle this? How do I split the JSON into multiple files by splitting the array in an efficient way? The JSON file should be split in a way that there is one file for each element in the "InstrumentData" array. All the other properties and structures should be retained in the split files.

like image 438
mkr231 Avatar asked Oct 23 '25 19:10

mkr231


1 Answer

It's not clear from your question what you mean by splitting the JSON by splitting the array "InstrumentData". The array in question is located at the path "Job.Steps[*].InstrumentData[*]", so are you also effectively going to split the containing array "Job.Steps[*]" as well? And what about prefix and postfix properties such as "Job.Keys" -- what do you want to do with them?

One approach to defining and implementing your split would be to adopt the approach from Strategy for splitting a large JSON file, suitably generalized for nested arrays. In that question, prefix and postfix properties were retained in each split file while the array to be split would be divided up in chunks. In that question the array property was at the root level, but in your case you need to specify a path to the array property. And if the array to be split is nested deeply within some other array values, those will need to get split as well.

Assuming this is what you want, the following extension method should do the trick:

/// <summary>
/// Streaming splitter that divides a JSON document into multiple files at a nested
/// array path (e.g. "Job.Steps[*].InstrumentData[*]"). Prefix properties already seen
/// on the path are replayed into every fragment; postfix properties encountered after
/// the split point are appended to every fragment already opened.
/// </summary>
public static partial class JsonExtensions
{
    /// <summary>
    /// Splits the file <paramref name="fileName"/> at the property path
    /// <paramref name="splitPath"/>, one array item per fragment file.
    /// Fragment names come from <paramref name="nameCreator"/>(directory, baseName,
    /// fragmentIndex, extension). Returns the names of all files created.
    /// </summary>
    public static string [] SplitJsonFile(string fileName, string [] splitPath, Func<string, string, int, string, string> nameCreator)
    {
        List<string> fileNames = new List<string>();
        
        var name = Path.GetFileNameWithoutExtension(fileName);
        var ext = Path.GetExtension(fileName);
        var directory = Path.GetDirectoryName(fileName);
        Func<int, TextWriter> createStream = (i) => 
        {
            // Use whatever method you like to generate a name for each fragment.
            var newName = nameCreator(directory, name, i, ext);
            var writer = new StreamWriter(newName, false, Encoding.UTF8);
            fileNames.Add(newName);
            return writer;
        };

        using (var reader = new StreamReader(fileName, Encoding.UTF8))
        {
            // maxItems = 1: exactly one bottom-level array item per output file.
            JsonExtensions.SplitJson(reader,splitPath, 1, createStream, Formatting.Indented);
        }

        return fileNames.ToArray();
    }
    
    /// <summary>
    /// Core entry point: splits the JSON read from <paramref name="textReader"/> at
    /// <paramref name="splitPath"/>, putting at most <paramref name="maxItems"/> items
    /// of the bottom-level array into each output stream from <paramref name="createStream"/>.
    /// </summary>
    public static void SplitJson(TextReader textReader, IList<string> splitPath, long maxItems, Func<int, TextWriter> createStream, Formatting formatting)
    {
        if (splitPath == null || createStream == null || textReader == null)
            throw new ArgumentNullException();
        if (splitPath.Count < 1 || maxItems < 1)
            throw new ArgumentException();
        using (var reader = new JsonTextReader(textReader))
        {
            List<JsonWriter> writers = new ();
            List<ParentToken> parentTokens = new ();
            try
            {
                SplitJson(reader, splitPath, 0, maxItems, createStream, formatting, parentTokens, writers);
            }
            finally
            {
                // Make sure files are closed in the event of an exception.
                foreach (IDisposable writer in writers)
                    writer?.Dispose();
            }
        }
    }
    
    /// <summary>
    /// Records one currently-open container (object or array) on the path down to the
    /// split point, together with the prefix tokens already written into it, so that a
    /// newly created fragment writer can replay the identical opening structure.
    /// </summary>
    struct ParentToken
    {
        public ParentToken(JsonToken tokenType, IList<JToken> prefixTokens = default) => (this.TokenType, this._prefixTokens) = (tokenType, prefixTokens);
        readonly IList<JToken> _prefixTokens;
        // StartObject or StartArray for the open container.
        public JsonToken TokenType { get; }
        // Tokens to replay into a new fragment; empty when none were captured.
        public IList<JToken> PrefixTokens => _prefixTokens ?? Array.Empty<JToken>();
    }
    
    /// <summary>
    /// Opens the next fragment stream and replays every open container plus its prefix
    /// tokens, leaving the new writer positioned at the same nesting depth as the
    /// current one. AutoCompleteOnClose is disabled because containers are closed
    /// manually across all writers at once.
    /// </summary>
    static JsonWriter AddWriter(List<JsonWriter> writers, List<ParentToken> parentTokens, Func<int, TextWriter> createStream, Formatting formatting)
    {
        var writer = new JsonTextWriter(createStream(writers.Count)) { Formatting = formatting, AutoCompleteOnClose = false };
        writers.Add(writer);
        foreach (var parent in parentTokens)
        {
            switch (parent.TokenType)
            {
                case JsonToken.StartObject:
                    writer.WriteStartObject();
                    break;
                case JsonToken.StartArray:
                    writer.WriteStartArray();
                    break;
                default:
                    throw new JsonException();
            }
            for (int i = 0; i < parent.PrefixTokens.Count; i++)
            {
                // A trailing JProperty with an Undefined value marks an "unclosed"
                // property: write only its name, its value will follow.
                if (i == parent.PrefixTokens.Count - 1 && parent.PrefixTokens[i] is JProperty property && property.Value.Type == JTokenType.Undefined)
                    writer.WritePropertyName(property.Name);
                else
                    parent.PrefixTokens[i].WriteTo(writer);
            }
        }
        return writer;
    }
    
    /// <summary>Returns the most recent writer and its index, creating the first one on demand.</summary>
    static (JsonWriter, int) GetCurrentWriter(List<JsonWriter> writers, List<ParentToken> parentTokens, Func<int, TextWriter> createStream, Formatting formatting)
        => writers.Count == 0 ? (AddWriter(writers, parentTokens, createStream, formatting), 0) : (writers[writers.Count-1], writers.Count-1);
    
    /// <summary>
    /// Recursive worker. <paramref name="index"/> is the position in <paramref name="splitPath"/>;
    /// once the whole path is consumed ("bottom"), array items are chunked maxItems at a time,
    /// otherwise each array item and matching property recurses one level deeper.
    /// Leaves the reader positioned ON the end token of whatever it consumed.
    /// </summary>
    static void SplitJson(JsonTextReader reader, IList<string> splitPath, int index, long maxItems, Func<int, TextWriter> createStream, Formatting formatting, List<ParentToken> parentTokens , List<JsonWriter> writers)
    {
        var startTokenType = reader.MoveToContentAndAssert().TokenType;
        var startReaderDepth = reader.Depth;

        // True once the entire split path has been consumed: the next array is the one
        // whose items are divided up into fragments.
        var bottom = index >= splitPath.Count;
        
        switch (startTokenType)
        {
            case JsonToken.StartObject:
                {
                    (var firstWriter, var firstWriterIndex) = GetCurrentWriter(writers, parentTokens, createStream, formatting);
                    bool prefix = true;
                    bool doRead = true;
                    firstWriter.WriteStartObject();
                    var parentToken = new ParentToken(JsonToken.StartObject, new List<JToken>());
                    while ((doRead ? reader.ReadToContentAndAssert() : reader.MoveToContentAndAssert()).TokenType != JsonToken.EndObject)
                    {
                        doRead = true;
                        var propertyName = (string)reader.AssertTokenType(JsonToken.PropertyName).Value;
                        if (propertyName == splitPath[index])
                        {
                            if (!prefix)
                                throw new JsonException(string.Format("Duplicated property name {0}", propertyName));
                            prefix = false;
                            
                            // Advance reader to value.
                            reader.ReadToContentAndAssert();
                            
                            // Add a token with the current property name but an undefined value.  This indicates an unclosed property.
                            firstWriter.WritePropertyName(propertyName);
                            parentToken.PrefixTokens.Add(new JProperty(propertyName, JValue.CreateUndefined()));
                            
                            parentTokens.Add(parentToken);
                            
                            // SplitJson() leaves the reader positioned ON the end of the token that was read, rather than after.
                            SplitJson(reader, splitPath, index + 1, maxItems, createStream, formatting, parentTokens, writers);
                            parentTokens.RemoveAt(parentTokens.Count-1);
                        }
                        else if (prefix)
                        {
                            // JProperty.Load() leaves the reader positioned AFTER the token that was read, rather than at the end.
                            var property = JProperty.Load(reader);
                            property.WriteTo(firstWriter);
                            parentToken.PrefixTokens.Add(property);
                            doRead = false;
                        }
                        else
                        {
                            // Postfix property: replicate into every fragment opened so far.
                            var property = JProperty.Load(reader);
                            for (int i = firstWriterIndex; i < writers.Count; i++)
                            {
                                property.WriteTo(writers[i]);
                            }
                            doRead = false;
                        }
                    }
                    for (int i = firstWriterIndex; i < writers.Count; i++)
                    {
                        if (prefix)
                            // We never found the property
                            foreach (var property in parentToken.PrefixTokens)
                                property.WriteTo(writers[i]);
                        writers[i].WriteEndObject();
                    }
                }
                break;
            case JsonToken.StartArray: // Split the array.
                {
                    // Above the bottom of the path, containing arrays are split one
                    // item per fragment; at the bottom, maxItems items per fragment.
                    var maxItemsAtDepth = bottom ? maxItems : 1L;
                    (var writer, var firstWriterIndex) = GetCurrentWriter(writers, parentTokens, createStream, formatting);
                    writer.WriteStartArray();
                    long count = 0L;
                    while (reader.ReadToContentAndAssert().TokenType != JsonToken.EndArray)
                    {
                        if (reader.TokenType == JsonToken.Comment || reader.TokenType == JsonToken.None)
                            continue;
                        if (count >= maxItemsAtDepth)
                        {
                            // Chunk is full: start a new fragment file.
                            writer = AddWriter(writers, parentTokens, createStream, formatting);
                            writer.WriteStartArray();
                            count = 0L;
                        }
                        if (bottom)
                            // WriteToken() leaves the reader positioned ON the end of the token that was read, rather than after.
                            writer.WriteToken(reader);
                        else
                        {
                            parentTokens.Add(new ParentToken(JsonToken.StartArray));
                            // SplitJson() leaves the reader positioned ON the end of the token that was read, rather than after.
                            SplitJson(reader, splitPath, index, maxItems, createStream, formatting, parentTokens, writers);
                            parentTokens.RemoveAt(parentTokens.Count-1);
                        }
                        count++;
                    }
                    for (int i = firstWriterIndex; i < writers.Count; i++)
                    {
                        writers[i].WriteEndArray();
                    }
                }
                break;
            default: // null, for instance
                {
                    (var writer, var _) = GetCurrentWriter(writers, parentTokens, createStream, formatting);
                    writer.WriteToken(reader);
                }
                break;
        }
    }       
}

/// <summary>
/// Low-level <see cref="JsonReader"/> guard helpers: assert the current token type,
/// advance past comments and the beginning-of-stream marker, and fail loudly on a
/// truncated stream. All helpers return the reader to allow chaining.
/// </summary>
public static partial class JsonExtensions
{
    /// <summary>Throws unless the reader is currently positioned on <paramref name="tokenType"/>.</summary>
    public static JsonReader AssertTokenType(this JsonReader reader, JsonToken tokenType)
    {
        if (reader.TokenType != tokenType)
            throw new JsonSerializationException(string.Format("Unexpected token {0}, expected {1}", reader.TokenType, tokenType));
        return reader;
    }

    /// <summary>Advances to the next token, then skips past any comments.</summary>
    public static JsonReader ReadToContentAndAssert(this JsonReader reader)
    {
        return reader.ReadAndAssert().MoveToContentAndAssert();
    }

    /// <summary>
    /// Without consuming content: steps past the beginning-of-stream marker (None)
    /// and any run of comment tokens, leaving the reader on real content.
    /// </summary>
    public static JsonReader MoveToContentAndAssert(this JsonReader reader)
    {
        if (reader is null)
            throw new ArgumentNullException();
        if (reader.TokenType == JsonToken.None)       // Skip past beginning of stream.
            reader.ReadAndAssert();
        while (reader.TokenType == JsonToken.Comment) // Skip past comments.
            reader.ReadAndAssert();
        return reader;
    }

    /// <summary>Reads one token, throwing instead of returning false at end of stream.</summary>
    public static JsonReader ReadAndAssert(this JsonReader reader)
    {
        if (reader is null)
            throw new ArgumentNullException();
        return reader.Read()
            ? reader
            : throw new JsonReaderException("Unexpected end of JSON stream.");
    }
}

And then, to split on "Job.Steps[*].InstrumentData[*]", call it as follows:

var fileNames = JsonExtensions.SplitJsonFile(fileName, 
                                             new [] { "Job", "Steps", "InstrumentData" }, 
                                             (directory, name, i, ext) => Path.Combine(directory, Path.ChangeExtension(name + $"_fragment_{i}", ext)));

Alternatively, to split on "Job.Steps[*].InstrumentData[*].Measurements[*]", call as follows:

var fileNames = JsonExtensions.SplitJsonFile(fileName, 
                                             new [] { "Job", "Steps", "InstrumentData", "Measurements" }, 
                                             (directory, name, i, ext) => Path.Combine(directory, Path.ChangeExtension(name + $"_fragment_{i}", ext)));

Demo fiddle here.

I also tested this enhanced version of JsonExtensions.SplitJson() using the JSON from Strategy for splitting a large JSON file to verify there was no regression; there was not. See fiddle #2 here.

like image 80
dbc Avatar answered Oct 26 '25 09:10

dbc