Commit 
							
							·
						
						3bf9f70
	
1
								Parent(s):
							
							5b28618
								
Add verifyToken field to verify evaluation results are produced by Hugging Face's automatic model evaluator (#2)
Browse files
- Add verifyToken field to verify evaluation results are produced by Hugging Face's automatic model evaluator (b3dc64e9033fee10e40afc70931d8687a456c6ca)
Co-authored-by: Evaluation Bot <[email protected]>
    	
        README.md
    CHANGED
    
    | @@ -1,10 +1,10 @@ | |
| 1 | 
             
            ---
         | 
| 2 | 
             
            language:
         | 
| 3 | 
             
            - en
         | 
|  | |
| 4 | 
             
            tags:
         | 
| 5 | 
             
            - summarization
         | 
| 6 | 
             
            - pegasus
         | 
| 7 | 
            -
            license: apache-2.0
         | 
| 8 | 
             
            datasets:
         | 
| 9 | 
             
            - kmfoda/booksum
         | 
| 10 | 
             
            metrics:
         | 
| @@ -23,39 +23,38 @@ widget: | |
| 23 | 
             
                deviation of the average recurrence interval, the more specific could be the long
         | 
| 24 | 
             
                term prediction of a future mainshock.
         | 
| 25 | 
             
              example_title: earthquakes
         | 
| 26 | 
            -
            - text:  | 
| 27 | 
            -
                 | 
| 28 | 
            -
                 | 
| 29 | 
            -
                 | 
| 30 | 
            -
                 | 
| 31 | 
            -
                 | 
| 32 | 
            -
                 | 
| 33 | 
            -
                 | 
| 34 | 
            -
                 | 
| 35 | 
            -
                 | 
| 36 | 
            -
                 | 
| 37 | 
            -
                 | 
| 38 | 
            -
                 | 
| 39 | 
            -
                 | 
| 40 | 
            -
                 | 
| 41 | 
            -
                 | 
| 42 | 
            -
                 | 
| 43 | 
            -
                 | 
| 44 | 
            -
                 | 
| 45 | 
            -
                 | 
| 46 | 
            -
                 | 
| 47 | 
            -
                 | 
| 48 | 
            -
                 | 
| 49 | 
            -
                 | 
| 50 | 
            -
                 | 
| 51 | 
            -
                 | 
| 52 | 
            -
                 | 
| 53 | 
            -
                 | 
| 54 | 
            -
                 | 
| 55 | 
            -
                 | 
| 56 | 
            -
                 | 
| 57 | 
            -
                 | 
| 58 | 
            -
                \ this function space (Section 5)."
         | 
| 59 | 
             
              example_title: scientific paper
         | 
| 60 | 
             
            - text: ' the big variety of data coming from diverse sources is one of the key properties
         | 
| 61 | 
             
                of the big data phenomenon. It is, therefore, beneficial to understand how data
         | 
| @@ -100,50 +99,62 @@ widget: | |
| 100 | 
             
                in their business An important area of data analytics on the edge of corporate
         | 
| 101 | 
             
                IT and the Internet is Web Analytics.'
         | 
| 102 | 
             
              example_title: data science textbook
         | 
| 103 | 
            -
            - text:  | 
| 104 | 
            -
                 | 
| 105 | 
            -
                 | 
| 106 | 
            -
                 | 
| 107 | 
            -
                 | 
| 108 | 
            -
                 | 
| 109 | 
            -
                 | 
| 110 | 
            -
             | 
| 111 | 
            -
                 | 
| 112 | 
            -
                 | 
| 113 | 
            -
                 | 
| 114 | 
            -
                 | 
| 115 | 
            -
                 | 
| 116 | 
            -
             | 
| 117 | 
            -
                 | 
| 118 | 
            -
                 | 
| 119 | 
            -
                 | 
| 120 | 
            -
                 | 
| 121 | 
            -
                 | 
| 122 | 
            -
                 | 
| 123 | 
            -
                 | 
| 124 | 
            -
                 | 
| 125 | 
            -
                 | 
| 126 | 
            -
                 | 
| 127 | 
            -
             | 
| 128 | 
            -
                 | 
| 129 | 
            -
                 | 
| 130 | 
            -
             | 
| 131 | 
            -
                 | 
| 132 | 
            -
                 | 
| 133 | 
            -
             | 
| 134 | 
            -
                 | 
| 135 | 
            -
                 | 
| 136 | 
            -
                 | 
| 137 | 
            -
                 | 
| 138 | 
            -
             | 
| 139 | 
            -
                 | 
| 140 | 
            -
                 | 
| 141 | 
            -
                 | 
| 142 | 
            -
                 | 
| 143 | 
            -
             | 
| 144 | 
            -
                 | 
| 145 | 
            -
                 | 
| 146 | 
            -
                 | 
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
| 147 | 
             
              example_title: bigbird blog intro
         | 
| 148 | 
             
            inference:
         | 
| 149 | 
             
              parameters:
         | 
| @@ -166,30 +177,36 @@ model-index: | |
| 166 | 
             
                  config: kmfoda--booksum
         | 
| 167 | 
             
                  split: test
         | 
| 168 | 
             
                metrics:
         | 
| 169 | 
            -
                -  | 
| 170 | 
            -
                  type: rouge
         | 
| 171 | 
             
                  value: 29.1023
         | 
|  | |
| 172 | 
             
                  verified: true
         | 
| 173 | 
            -
             | 
| 174 | 
            -
             | 
| 175 | 
             
                  value: 6.2441
         | 
|  | |
| 176 | 
             
                  verified: true
         | 
| 177 | 
            -
             | 
| 178 | 
            -
             | 
| 179 | 
             
                  value: 14.7503
         | 
|  | |
| 180 | 
             
                  verified: true
         | 
| 181 | 
            -
             | 
| 182 | 
            -
             | 
| 183 | 
             
                  value: 27.2375
         | 
|  | |
| 184 | 
             
                  verified: true
         | 
| 185 | 
            -
             | 
| 186 | 
            -
             | 
| 187 | 
             
                  value: 2.979011058807373
         | 
|  | |
| 188 | 
             
                  verified: true
         | 
| 189 | 
            -
             | 
| 190 | 
            -
             | 
| 191 | 
             
                  value: 467.269
         | 
|  | |
| 192 | 
             
                  verified: true
         | 
|  | |
| 193 | 
             
            ---
         | 
| 194 |  | 
| 195 |  | 
|  | |
| 1 | 
             
            ---
         | 
| 2 | 
             
            language:
         | 
| 3 | 
             
            - en
         | 
| 4 | 
            +
            license: apache-2.0
         | 
| 5 | 
             
            tags:
         | 
| 6 | 
             
            - summarization
         | 
| 7 | 
             
            - pegasus
         | 
|  | |
| 8 | 
             
            datasets:
         | 
| 9 | 
             
            - kmfoda/booksum
         | 
| 10 | 
             
            metrics:
         | 
|  | |
| 23 | 
             
                deviation of the average recurrence interval, the more specific could be the long
         | 
| 24 | 
             
                term prediction of a future mainshock.
         | 
| 25 | 
             
              example_title: earthquakes
         | 
| 26 | 
            +
            - text: ' A typical feed-forward neural field algorithm. Spatiotemporal coordinates
         | 
| 27 | 
            +
                are fed into a neural network that predicts values in the reconstructed domain.
         | 
| 28 | 
            +
                Then, this domain is mapped to the sensor domain where sensor measurements are
         | 
| 29 | 
            +
                available as supervision. Class and Section Problems Addressed Generalization
         | 
| 30 | 
            +
                (Section 2) Inverse problems, ill-posed problems, editability; symmetries. Hybrid
         | 
| 31 | 
            +
                Representations (Section 3) Computation & memory efficiency, representation capacity,
         | 
| 32 | 
            +
                editability: Forward Maps (Section 4) Inverse problems Network Architecture (Section
         | 
| 33 | 
            +
                5) Spectral bias, integration & derivatives. Manipulating Neural Fields (Section
         | 
| 34 | 
            +
                6) Edit ability, constraints, regularization. Table 2: The five classes of techniques
         | 
| 35 | 
            +
                in the neural field toolbox each addresses problems that arise in learning, inference,
         | 
| 36 | 
            +
                and control. (Section 3). We can supervise reconstruction via differentiable forward
         | 
| 37 | 
            +
                maps that transform or project our domain (e.g., 3D reconstruction via 2D images;
         | 
| 38 | 
            +
                Section 4) With appropriate network architecture choices, we can overcome neural
         | 
| 39 | 
            +
                network spectral biases (blurriness) and efficiently compute derivatives and integrals
         | 
| 40 | 
            +
                (Section 5). Finally, we can manipulate neural fields to add constraints and regularizations,
         | 
| 41 | 
            +
                and to achieve editable representations (Section 6). Collectively, these classes
         | 
| 42 | 
            +
                constitute a ''toolbox'' of techniques to help solve problems with neural fields
         | 
| 43 | 
            +
                There are three components in a conditional neural field: (1) An encoder or inference
         | 
| 44 | 
            +
                function $E$ that outputs the conditioning latent variable $z$ given an observation
         | 
| 45 | 
            +
                $O$: $E(O) = z$. $z$ is typically a low-dimensional vector, and is often referred to as
         | 
| 46 | 
            +
                a latent code or feature code. (2) A mapping function $\Psi$ between $z$ and neural field
         | 
| 47 | 
            +
                parameters $\Theta$: $\Psi(z) = \Theta$; (3) The neural field itself $\Phi$. The encoder $E$ finds the
         | 
| 48 | 
            +
                most probable $z$ given the observations $O$: $\arg\max_z P(z \mid O)$. The decoder maximizes
         | 
| 49 | 
            +
                the inverse conditional probability to find the most probable $O$ given $z$: arg-
         | 
| 50 | 
            +
                max $P(O \mid z)$. We discuss different encoding schemes with different optimality guarantees
         | 
| 51 | 
            +
                (Section 2.1.1), both global and local conditioning (Section 2.1.2), and different
         | 
| 52 | 
            +
                mapping functions Y (Section 2.1.3) 2. Generalization Suppose we wish to estimate
         | 
| 53 | 
            +
                a plausible 3D surface shape given a partial or noisy point cloud. We need a suitable
         | 
| 54 | 
            +
                prior over the surface in its reconstruction domain to generalize to the partial
         | 
| 55 | 
            +
                observations. A neural network expresses a prior via the function space of its
         | 
| 56 | 
            +
                architecture and parameters $\Theta$, and generalization is influenced by the inductive
         | 
| 57 | 
            +
                bias of this function space (Section 5).'
         | 
|  | |
| 58 | 
             
              example_title: scientific paper
         | 
| 59 | 
             
            - text: ' the big variety of data coming from diverse sources is one of the key properties
         | 
| 60 | 
             
                of the big data phenomenon. It is, therefore, beneficial to understand how data
         | 
|  | |
| 99 | 
             
                in their business An important area of data analytics on the edge of corporate
         | 
| 100 | 
             
                IT and the Internet is Web Analytics.'
         | 
| 101 | 
             
              example_title: data science textbook
         | 
| 102 | 
            +
            - text: 'Transformer-based models have shown to be very useful for many NLP tasks.
         | 
| 103 | 
            +
                However, a major limitation of transformers-based models is its $O(n^2)$ time
         | 
| 104 | 
            +
                & memory complexity (where $n$ is sequence length). Hence, it''s computationally
         | 
| 105 | 
            +
                very expensive to apply transformer-based models on long sequences $n > 512$.
         | 
| 106 | 
            +
                Several recent papers, e.g. Longformer, Performer, Reformer, Clustered attention
         | 
| 107 | 
            +
                try to remedy this problem by approximating the full attention matrix. You can
         | 
| 108 | 
            +
                checkout 🤗''s recent blog post in case you are unfamiliar with these models.
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                BigBird (introduced in paper) is one of such recent models to address this issue.
         | 
| 111 | 
            +
                BigBird relies on block sparse attention instead of normal attention (i.e. BERT''s
         | 
| 112 | 
            +
                attention) and can handle sequences up to a length of 4096 at a much lower computational
         | 
| 113 | 
            +
                cost compared to BERT. It has achieved SOTA on various tasks involving very long
         | 
| 114 | 
            +
                sequences such as long documents summarization, question-answering with long contexts.
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                BigBird RoBERTa-like model is now available in 🤗Transformers. The goal of this
         | 
| 117 | 
            +
                post is to give the reader an in-depth understanding of big bird implementation
         | 
| 118 | 
            +
                & ease one''s life in using BigBird with 🤗Transformers. But, before going into
         | 
| 119 | 
            +
                more depth, it is important to remember that the BigBird''s attention is an approximation
         | 
| 120 | 
            +
                of BERT''s full attention and therefore does not strive to be better than BERT''s
         | 
| 121 | 
            +
                full attention, but rather to be more efficient. It simply allows to apply transformer-based
         | 
| 122 | 
            +
                models to much longer sequences since BERT''s quadratic memory requirement quickly
         | 
| 123 | 
            +
                becomes unbearable. Simply put, if we would have ∞ compute & ∞ time, BERT''s attention
         | 
| 124 | 
            +
                would be preferred over block sparse attention (which we are going to discuss
         | 
| 125 | 
            +
                in this post).
         | 
| 126 | 
            +
             | 
| 127 | 
            +
                If you wonder why we need more compute when working with longer sequences, this
         | 
| 128 | 
            +
                blog post is just right for you!
         | 
| 129 | 
            +
             | 
| 130 | 
            +
                Some of the main questions one might have when working with standard BERT-like
         | 
| 131 | 
            +
                attention include:
         | 
| 132 | 
            +
             | 
| 133 | 
            +
                Do all tokens really have to attend to all other tokens? Why not compute attention
         | 
| 134 | 
            +
                only over important tokens? How to decide what tokens are important? How to attend
         | 
| 135 | 
            +
                to just a few tokens in a very efficient way? In this blog post, we will try to
         | 
| 136 | 
            +
                answer those questions.
         | 
| 137 | 
            +
             | 
| 138 | 
            +
                What tokens should be attended to? We will give a practical example of how attention
         | 
| 139 | 
            +
                works by considering the sentence ''BigBird is now available in HuggingFace for
         | 
| 140 | 
            +
                extractive question answering''. In BERT-like attention, every word would simply
         | 
| 141 | 
            +
                attend to all other tokens.
         | 
| 142 | 
            +
             | 
| 143 | 
            +
                Let''s think about a sensible choice of key tokens that a queried token actually
         | 
| 144 | 
            +
                only should attend to by writing some pseudo-code. We will assume that the token
         | 
| 145 | 
            +
                available is queried and build a sensible list of key tokens to attend to.
         | 
| 146 | 
            +
             | 
| 147 | 
            +
                >>> # let''s consider following sentence as an example >>> example = [''BigBird'',
         | 
| 148 | 
            +
                ''is'', ''now'', ''available'', ''in'', ''HuggingFace'', ''for'', ''extractive'',
         | 
| 149 | 
            +
                ''question'', ''answering'']
         | 
| 150 | 
            +
             | 
| 151 | 
            +
                >>> # further let''s assume, we''re trying to understand the representation of
         | 
| 152 | 
            +
                ''available'' i.e. >>> query_token = ''available'' >>> # We will initialize an
         | 
| 153 | 
            +
                empty `set` and fill up the tokens of our interest as we proceed in this section.
         | 
| 154 | 
            +
                >>> key_tokens = [] # => currently ''available'' token doesn''t have anything
         | 
| 155 | 
            +
                to attend Nearby tokens should be important because, in a sentence (sequence of
         | 
| 156 | 
            +
                words), the current word is highly dependent on neighboring past & future tokens.
         | 
| 157 | 
            +
                This intuition is the idea behind the concept of sliding attention.'
         | 
| 158 | 
             
              example_title: bigbird blog intro
         | 
| 159 | 
             
            inference:
         | 
| 160 | 
             
              parameters:
         | 
|  | |
| 177 | 
             
                  config: kmfoda--booksum
         | 
| 178 | 
             
                  split: test
         | 
| 179 | 
             
                metrics:
         | 
| 180 | 
            +
                - type: rouge
         | 
|  | |
| 181 | 
             
                  value: 29.1023
         | 
| 182 | 
            +
                  name: ROUGE-1
         | 
| 183 | 
             
                  verified: true
         | 
| 184 | 
            +
                  verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiYTFhNjg4YTFlODU5MmVjNGVmNDRmMjQ4M2YyZGNmMWRlYjBhZmVhMTY3ZTUxNDkzNjY0OGVmNWJlNmY1OTkzNCIsInZlcnNpb24iOjF9.E_rVKqB7WEerLeRq6JIVTLZ1TgmsThFQJVKh11WH1qWa-cL3766psPWDKe8mK3lNkjmwbiDW0DZlDt4dm2ATCA
         | 
| 185 | 
            +
                - type: rouge
         | 
| 186 | 
             
                  value: 6.2441
         | 
| 187 | 
            +
                  name: ROUGE-2
         | 
| 188 | 
             
                  verified: true
         | 
| 189 | 
            +
                  verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiNDVmZmFlOTgwN2Q3ZWRkZGVkMzU1ZDRkYzU1MWMzMTk1NDM5YTU0MzFjNDljNmZlY2I2NjZmZjcyYjBkZGExZCIsInZlcnNpb24iOjF9.QnuGoMWX8cq5_ukRtiaLRLau_F9XiCjg313GC7Iu1VGK8Kj_9lzU43377VsH0fBWooA1zJjtIK0UA-YpGQQOAA
         | 
| 190 | 
            +
                - type: rouge
         | 
| 191 | 
             
                  value: 14.7503
         | 
| 192 | 
            +
                  name: ROUGE-L
         | 
| 193 | 
             
                  verified: true
         | 
| 194 | 
            +
                  verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiMzJhNzE0YjZiZWQ4NDE1Yjg3ZGJjY2ZmYWEwYzU5MTRhYWNiNTcyODU1NzM5NTZhNjNlNmYwNDVlYmZmYjkxOCIsInZlcnNpb24iOjF9.m5BLUMefXa1KivIIE9-gYKYq5aRRbfpQWazqzXxfCsqqp38Lt0ymk6OwXSlQyB_5oksNHIDFKpJX4wjYx2i7Bw
         | 
| 195 | 
            +
                - type: rouge
         | 
| 196 | 
             
                  value: 27.2375
         | 
| 197 | 
            +
                  name: ROUGE-LSUM
         | 
| 198 | 
             
                  verified: true
         | 
| 199 | 
            +
                  verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiMTY1OTIxMzBkMGJiZmNiNjZjYmQ2MjUwMjBkYTg5Zjc1NjVlZjllNTg0MDM1NTdhZDJlZmIwOTczOGNkZDc5YyIsInZlcnNpb24iOjF9.bThI16mvqhEuGBhdao0w8j03vv9G9Quy-ITRZzalr41zOour9it4oxEPFCvmPf-nLCQkqgWKUDEzgr6Ww8qgBg
         | 
| 200 | 
            +
                - type: loss
         | 
| 201 | 
             
                  value: 2.979011058807373
         | 
| 202 | 
            +
                  name: loss
         | 
| 203 | 
             
                  verified: true
         | 
| 204 | 
            +
                  verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiOGM0NzM3YTI4Njg4NDY0ZjQzNTZmYTIxYzcxNDBlNzAwNTAxNDE4MTZjYmZmNzYwODU0OWQ1ZjM5YjRmMmFkZiIsInZlcnNpb24iOjF9.EPEP53AoqHz0rjVGStJI2dM7ivxFmOj572I3llWdAoejm3zO1Iq5WDArYsqOse_oLxYCgcqPmNVc5IcLW9x7Dg
         | 
| 205 | 
            +
                - type: gen_len
         | 
| 206 | 
             
                  value: 467.269
         | 
| 207 | 
            +
                  name: gen_len
         | 
| 208 | 
             
                  verified: true
         | 
| 209 | 
            +
                  verifyToken: eyJhbGciOiJFZERTQSIsInR5cCI6IkpXVCJ9.eyJoYXNoIjoiNjgzYzU2ZjkwN2RhNzJlZmQyZTBlYmUxMTZhNzg0ODMwMjA3OTUzNTIwOWFkZWVmNjVmMTJiZmZhNWFmY2UzZCIsInZlcnNpb24iOjF9.RW5tzk2fcc_m4bgaSopRDFhSR9R8hRaYKrstXH4X5iGP_Xwvhy5Q7-igd2ACnlxIfmtdTmMxLMsvHr5oAZEwDg
         | 
| 210 | 
             
            ---
         | 
| 211 |  | 
| 212 |  | 

