使用LINQ GroupBy链产生带有聚合的多层次层次结构

红润的

我碰巧做到了这一点,并想问一下有关O(n),订单稳定性和枚举器实例的幕后保证。

我了解每次聚合所实例化的枚举器数量(例如计数和持续时间的聚合)会根据子级到父级的实际分布而变化,但每条记录对每个聚合统计是否确实只被枚举一次?

在此示例中,我们有3个聚合和3个层次结构层次进行聚合,因此为O(9n)〜O(n)。

LINQ GroupBy 问题:

  1. 是GroupBy被证明既具有线性复杂性又具有稳定的顺序,或者仅仅是Microsoft .NET的隐式实现方式?我没有任何特别的怀疑,它可能是非线性的,只是出于好奇而问。
  2. 似乎实例化的枚举数应始终与层次结构中看到的父节点数成线性关系,对吗?
  3. 在相同的聚合统计和级别期间,不会多次枚举叶子或非叶子,是吗?因为每个节点只是某个“结果”中一个父级的子级。
    using System;
    using System.Collections.Generic;
    using System.Linq;
    using Microsoft.VisualStudio.TestTools.UnitTesting;

    namespace NestedGroupBy
    {
        [TestClass]
        public class UnitTest1
        {
            /// <summary>
            /// Immutable sample row: one leaf observation in the
            /// Category -> Subcategory -> Event hierarchy, with a duration.
            /// </summary>
            struct Record
            {
                // cat->subcat->event is the composite 3-tier key
                private readonly string category;
                private readonly string subcategory;
                private readonly string ev;
                private readonly double duration;

                public Record(string category, string subcategory, string ev, double duration)
                {
                    this.category = category;
                    this.subcategory = subcategory;
                    this.ev = ev;
                    this.duration = duration;
                }

                public string Category { get { return category; } }
                public string Subcategory { get { return subcategory; } }
                public string Event { get { return ev; } }
                public double Duration { get { return duration; } }
            }

            // Deliberately unsorted sample data so the OrderBy below has work to do.
            static readonly IList<Record> Rec1 = new System.Collections.ObjectModel.ReadOnlyCollection<Record>
            (new[] {
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "Security", "ReadSocket", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadDirectory", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadDirectory", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadDirectory", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadSize", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadSize", 0.0145),
                 new Record ("File", "ReadMetadata", "ReadSize", 0.0145),
                 new Record ("Registry", "ReadKey", "ReadKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "ReadKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "ReadKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "ReadKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CacheKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CacheKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CheckKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CheckKeyAcl", 0.0145),
                 new Record ("Registry", "ReadKey", "CheckKeyAcl", 0.0145),
                 new Record ("Registry", "WriteKey", "CheckPermissions", 0.0145),
                 new Record ("Registry", "WriteKey", "CheckOwner", 0.0145),
                 new Record ("Registry", "WriteKey", "InheritPermissions", 0.0145),
                 new Record ("Registry", "WriteKey", "ValidateKey", 0.0145),
                 new Record ("Registry", "WriteKey", "RecacheKey", 0.0145),
                 new Record ("File", "WriteData", "FlushData", 0.0145),
                 new Record ("File", "WriteData", "WriteBuffer", 0.0145),
                 new Record ("File", "WriteData", "WritePermissions", 0.0145),
                 new Record ("File", "ReadData", "CheckDataBuffer", 0.0145),
                 new Record ("File", "ReadData", "ReadBuffer", 0.0145),
                 new Record ("File", "ReadData", "ReadPermissions", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "Security", "ReadSocket", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
                 new Record ("Network", "Security", "ReadSocket", 0.0145),
                 new Record ("Network", "SecurityCheck", "ReadAcl", 0.0145),
            });

            /// <summary>
            /// Builds a 3-level rollup (Category / Subcategory / Event) with
            /// chained GroupBy calls, then prints the tree with per-level
            /// child counts, leaf event counts and duration sums.
            /// </summary>
            [TestMethod]
            public void TestMethod1()
            {
                // Perform one big sort to order all child rungs properly
                var s = Rec1.OrderBy(
                    r => r,
                    Comparer<Record>.Create(
                        (x, y) =>
                        {
                            int c = x.Category.CompareTo(y.Category);
                            if (c != 0) return c;
                            c = x.Subcategory.CompareTo(y.Subcategory);
                            if (c != 0) return c;
                            return x.Event.CompareTo(y.Event);
                        }));

                // This query enumerates bottom-up (in the key hierarchy-sense),
                // then proceeds to each higher summary (parent) level and retains
                // the "result" collection of its children determined by the preceding GroupBy.
                //
                // This is so each level can later step down into its own children for looping.
                // And the leaf durations, immediate child counts and leaf event counts are already calculated as well.
                //
                // I think this is O(n), since each record does not get repeatedly scanned for different levels of the same accumulation stat.
                // But under-the-hood there may be much grainy processing like enumerator instantiation, depending on child count density.
                var q = s
                    .GroupBy(
                        r => new { Category = r.Category, Subcategory = r.Subcategory, Event = r.Event },
                        (key, result) => {
                            // NOTE(review): "result" is enumerated here for Count, again for
                            // Sum, and once more later via LowChildren — three passes per group.
                            int c = result.Count();
                            return new
                            {
                                LowKey = key,
                                // at this lowest summary level only,
                                // the hierarchical (immediate child) count is the same as the event (leaf) count
                                LowChildCount = c,
                                LowEventCount = c,
                                LowDuration = result.Sum(x => x.Duration),
                                LowChildren = result
                            };
                        })
                        .GroupBy(
                            r => new { Category = r.LowKey.Category, Subcategory = r.LowKey.Subcategory },
                                (key, result) => new {
                                    MidKey = key,
                                    MidChildCount = result.Count(),
                                    MidEventCount = result.Sum(x => x.LowEventCount),
                                    MidDuration = result.Sum(x => x.LowDuration),
                                    MidChildren = result
                                })
                                .GroupBy(
                                    r => new { Category = r.MidKey.Category },
                                    (key, result) => new {
                                        HighKey = key,
                                        HighChildCount = result.Count(),
                                        HighEventCount = result.Sum(x => x.MidEventCount),
                                        HighDuration = result.Sum(x => x.MidDuration),
                                        HighChildren = result
                                    });


                foreach (var high in q)
                {
                    Console.WriteLine($"{high.HighKey.Category} child#={high.HighChildCount} event#={high.HighEventCount} duration={high.HighDuration}");
                    foreach (var mid in high.HighChildren)
                    {
                        // BUGFIX: originally printed high.HighEventCount at the mid level.
                        Console.WriteLine($"  {mid.MidKey.Subcategory} child#={mid.MidChildCount} event#={mid.MidEventCount} duration={mid.MidDuration}");
                        foreach (var low in mid.MidChildren)
                        {
                            // BUGFIX: originally printed high.HighEventCount at the low level.
                            Console.WriteLine($"    {low.LowKey.Event} child#={low.LowChildCount} event#={low.LowEventCount} duration={low.LowDuration}");
                            foreach (var leaf in low.LowChildren)
                            {
                                Console.WriteLine($"      >> {leaf.Category}/{leaf.Subcategory}/{leaf.Event} duration={leaf.Duration}");
                            }
                        }
                    }
                }
            }
        }
    }
Mrinal Kamboj

让我们从回顾 Enumerable.GroupBy 的 MS 源代码实现开始

GroupBy

// Reference-source overload of Enumerable.GroupBy with a result selector:
// groups 'source' by 'keySelector', projects each element through
// 'elementSelector', then maps every (key, group) pair through 'resultSelector'.
// No work happens here — execution is deferred to GroupedEnumerable.GetEnumerator().
public static IEnumerable<TResult> GroupBy<TSource, TKey, TElement, TResult>
(this IEnumerable<TSource> source, 
 Func<TSource, TKey> keySelector, 
 Func<TSource, TElement> elementSelector, 
 Func<TKey, IEnumerable<TElement>, TResult> resultSelector)
{
  return new GroupedEnumerable<TSource, TKey, TElement, TResult>(source, 
                                                                 keySelector, 
                                                                 elementSelector, 
                                                                 resultSelector, null); // null => EqualityComparer<TKey>.Default
}

GroupedEnumerable

// Deferred wrapper returned by GroupBy: stores the source sequence and the
// selector delegates, and only builds the grouping when enumerated.
internal class GroupedEnumerable<TSource, TKey, TElement, TResult> : IEnumerable<TResult>{
    IEnumerable<TSource> source;
    Func<TSource, TKey> keySelector;
    Func<TSource, TElement> elementSelector;
    IEqualityComparer<TKey> comparer;           // may be null => default comparer
    Func<TKey, IEnumerable<TElement>, TResult> resultSelector;

    // Validates the required arguments eagerly (at query-construction time),
    // even though enumeration itself is deferred.
    public GroupedEnumerable(IEnumerable<TSource> source, Func<TSource, TKey> keySelector, Func<TSource, TElement> elementSelector, Func<TKey, IEnumerable<TElement>, TResult> resultSelector, IEqualityComparer<TKey> comparer){
        if (source == null) throw Error.ArgumentNull("source");
        if (keySelector == null) throw Error.ArgumentNull("keySelector");
        if (elementSelector == null) throw Error.ArgumentNull("elementSelector");
        if (resultSelector == null) throw Error.ArgumentNull("resultSelector");
        this.source = source;
        this.keySelector = keySelector;
        this.elementSelector = elementSelector;
        this.comparer = comparer;
        this.resultSelector = resultSelector;
    }

    // On each enumeration: one full pass over 'source' builds a Lookup
    // (hash-based, so grouping is a single O(n) pass), then the result
    // selector is applied to every (key, group) pair.
    public IEnumerator<TResult> GetEnumerator(){
        Lookup<TKey, TElement> lookup = Lookup<TKey, TElement>.Create<TSource>(source, keySelector, elementSelector, comparer);
        return lookup.ApplyResultSelector(resultSelector).GetEnumerator();
    }

    IEnumerator IEnumerable.GetEnumerator(){
        return GetEnumerator();
    }
}

重要的一点是它内部使用的 Lookup 数据结构,它提供 O(1) 的键查找。GroupBy 在内部对源序列做一次完整遍历,把每条记录添加到 Lookup 中;最终结果的类型是 IEnumerable&lt;IGrouping&lt;TKey,TValue&gt;&gt;。

现在您的具体问题

是GroupBy被证明既具有线性复杂性又具有稳定的顺序,或者仅仅是Microsoft .NET的隐式实现方式?我没有任何特别的怀疑,它可能是非线性的,只是出于好奇而问。

从设计和代码来看,顶层是 O(N);但可以肯定的是,这还取决于结果选择器(resultSelector)——如果在其中又执行 O(N) 的操作,整体复杂度就会变成 O(N^2)。

似乎实例化的枚举数应始终与层次结构中看到的父节点数成线性关系,对吗?

是的,它只是顶层的单个枚举数

在相同的聚合统计和级别期间,不会多次枚举叶子或非叶子,是吗?因为每个节点只是某个“结果”中一个父级的子级。

通过非叶节点,我假设您引用的是嵌套数据,所以这取决于您的模型设计,但是如果每个元素都按预期的方式分别拥有嵌套数据的副本,而不是共享/引用,那么我看不到为什么元素将被多次触摸,它是特定子/子数据的枚举

更多细节

  • 在您的情况下,您编写的是链式的 GroupBy 查询,一个查询的结果作为下一个查询的输入。就复杂度而言,链本身更像是 O(N) + O(N) + O(N) ≈ O(N);但在每个查询内部,您至少还要再做一次 O(N) 的操作,因此由于这种嵌套设计,整体复杂度是 O(N^2) 而不是 O(N)。
  • 如果您有深层嵌套的数据,则最好用 SelectMany 取代链式的 GroupBy 先把数据展平,因为在展平后的数据上做最终聚合要简单得多。

本文收集自互联网,转载请注明来源。

如有侵权,请联系 [email protected] 删除。

编辑于
0

我来说两句

0 条评论
登录 后参与评论

相关文章