giulio98 committed on
Commit
d7607a1
·
1 Parent(s): 57d6a6f

Updating module

Browse files
bleu.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2017 Google Inc. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+ # ==============================================================================
15
+
16
+ """Python implementation of BLEU and smooth-BLEU.
17
+
18
+ This module provides a Python implementation of BLEU and smooth-BLEU.
19
+ Smooth BLEU is computed following the method outlined in the paper:
20
+ Chin-Yew Lin, Franz Josef Och. ORANGE: a method for evaluating automatic
21
+ evaluation metrics for machine translation. COLING 2004.
22
+ """
23
+
24
+ import collections
25
+ import math
26
+
27
+
28
def _get_ngrams(segment, max_order):
  """Counts every n-gram of length 1..max_order in a token sequence.

  Args:
    segment: sequence of tokens from which n-grams will be extracted.
    max_order: largest n-gram length, in tokens, to collect.

  Returns:
    A collections.Counter mapping each n-gram (a tuple of tokens) to the
    number of times it occurs in segment.
  """
  n_tokens = len(segment)
  # Shorter orders are enumerated first, then start positions left to right.
  return collections.Counter(
      tuple(segment[start:start + width])
      for width in range(1, max_order + 1)
      for start in range(n_tokens - width + 1))
46
+
47
+
48
def compute_bleu(reference_corpus, translation_corpus, max_order=4,
                 smooth=False):
  """Computes BLEU score of translated segments against one or more references.

  Args:
    reference_corpus: list of lists of references for each translation. Each
        reference should be tokenized into a list of tokens.
    translation_corpus: list of translations to score. Each translation
        should be tokenized into a list of tokens.
    max_order: Maximum n-gram order to use when computing BLEU score.
    smooth: Whether or not to apply Lin et al. 2004 smoothing.

  Returns:
    6-tuple (bleu, precisions, bp, ratio, translation_length,
    reference_length) where precisions is a list of max_order n-gram
    precisions, bp is the brevity penalty and ratio is
    translation_length / reference_length. An empty translation corpus
    yields ratio 0.0 and bp 0.0; a non-empty translation scored against
    zero-length references yields ratio inf and bp 1.0.
  """
  matches_by_order = [0] * max_order
  possible_matches_by_order = [0] * max_order
  reference_length = 0
  translation_length = 0
  for references, translation in zip(reference_corpus, translation_corpus):
    # The shortest reference sets the length used for the brevity penalty;
    # a segment with no references contributes 0 instead of raising.
    reference_length += min((len(r) for r in references), default=0)
    translation_length += len(translation)

    # Counter union keeps, per n-gram, the maximum count over all references.
    merged_ref_ngram_counts = collections.Counter()
    for reference in references:
      merged_ref_ngram_counts |= _get_ngrams(reference, max_order)
    translation_ngram_counts = _get_ngrams(translation, max_order)
    # Counter intersection clips each translation count to the reference count.
    overlap = translation_ngram_counts & merged_ref_ngram_counts
    for ngram, count in overlap.items():
      matches_by_order[len(ngram) - 1] += count
    for order in range(1, max_order + 1):
      possible_matches = len(translation) - order + 1
      if possible_matches > 0:
        possible_matches_by_order[order - 1] += possible_matches

  precisions = [0] * max_order
  for i in range(max_order):
    if smooth:
      precisions[i] = ((matches_by_order[i] + 1.) /
                       (possible_matches_by_order[i] + 1.))
    elif possible_matches_by_order[i] > 0:
      precisions[i] = (float(matches_by_order[i]) /
                       possible_matches_by_order[i])
    else:
      precisions[i] = 0.0

  if min(precisions) > 0:
    p_log_sum = sum((1. / max_order) * math.log(p) for p in precisions)
    geo_mean = math.exp(p_log_sum)
  else:
    geo_mean = 0

  if translation_length == 0:
    # exp(1 - 1/ratio) tends to 0 as ratio -> 0; avoid dividing by zero.
    ratio = 0.0
    bp = 0.0
  elif reference_length == 0:
    # Translation is longer than the (empty) references: no brevity penalty.
    ratio = float('inf')
    bp = 1.
  else:
    ratio = float(translation_length) / reference_length
    bp = 1. if ratio > 1.0 else math.exp(1 - 1. / ratio)

  bleu = geo_mean * bp

  return (bleu, precisions, bp, ratio, translation_length, reference_length)
113
+
114
+
115
def _bleu(ref_file, trans_file, subword_option=None):
  """Scores a translation file against a reference file with smoothed BLEU-4.

  Both files are read as UTF-8 text with one segment per line; each line is
  stripped and split on whitespace to obtain tokens. Lines are paired by
  position, so extra lines in the longer file are ignored.

  Args:
    ref_file: path of the reference file.
    trans_file: path of the translation file.
    subword_option: accepted for interface compatibility; not used here.

  Returns:
    The BLEU score scaled to 0-100 and rounded to two decimals.
  """
  max_order = 4
  smooth = True
  ref_files = [ref_file]
  reference_text = []
  for reference_filename in ref_files:
    # Explicit encoding so results do not depend on the platform default.
    with open(reference_filename, encoding='utf-8') as fh:
      reference_text.append(fh.readlines())
  # Transpose: one list of tokenized references per segment.
  per_segment_references = [
      [reference.strip().split() for reference in references]
      for references in zip(*reference_text)
  ]
  with open(trans_file, encoding='utf-8') as fh:
    translations = [line.strip().split() for line in fh]
  bleu_score = compute_bleu(per_segment_references, translations, max_order,
                            smooth)[0]
  return round(100 * bleu_score, 2)
codebleu.py CHANGED
@@ -15,6 +15,11 @@
15
 
16
  import evaluate
17
  import datasets
 
 
 
 
 
18
 
19
 
20
  # TODO: Add BibTeX citation
@@ -82,14 +87,83 @@ class CodeBLEU(evaluate.Metric):
82
  )
83
 
84
  def _download_and_prepare(self, dl_manager):
85
- """Optional: download external resources useful to compute the scores"""
86
- # TODO: Download external resources if needed
87
- pass
88
-
89
- def _compute(self, predictions, references):
90
- """Returns the scores"""
91
- # TODO: Compute the different scores of the module
92
- accuracy = sum(i == j for i, j in zip(predictions, references)) / len(predictions)
93
- return {
94
- "accuracy": accuracy,
95
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  import evaluate
17
  import datasets
18
+ import bleu
19
+ import weighted_ngram_match
20
+ import syntax_match
21
+ import dataflow_match
22
+ from tree_sitter import Language, Parser
23
 
24
 
25
  # TODO: Add BibTeX citation
 
87
  )
88
 
89
  def _download_and_prepare(self, dl_manager):
90
+ """Optional: download external resources useful to compute the scores"""
91
+ # TODO: Download external resources if needed
92
+ if self.config_name == "python":
93
+ Language.build_library('./parser/my-languages.so',['tree-sitter-python'])
94
+ elif self.config_name == "go":
95
+ Language.build_library('./parser/my-languages.so',['tree-sitter-go'])
96
+ elif self.config_name == "javascript":
97
+ Language.build_library('./parser/my-languages.so',['tree-sitter-javascript'])
98
+ elif self.config_name == "php":
99
+ Language.build_library('./parser/my-languages.so',['tree-sitter-php'])
100
+ elif self.config_name == "java":
101
+ Language.build_library('./parser/my-languages.so',['tree-sitter-java'])
102
+ elif self.config_name == "ruby":
103
+ Language.build_library('./parser/my-languages.so',['tree-sitter-ruby'])
104
+ elif self.config_name == "c-sharp":
105
+ Language.build_library('./parser/my-languages.so',['tree-sitter-c-sharp'])
106
+ elif self.config_name == "cpp":
107
+ Language.build_library('./parser/my-languages.so',['tree-sitter-cpp'])
108
+
109
+
110
+ )
111
+
112
+ def _compute(self, predictions, references, language, alpha=0.25, beta=0.25, gamma=0.25, theta=0.25):
113
+
114
+ # preprocess inputs
115
+ pre_references = [[x.strip() for x in open(file, 'r', encoding='utf-8').readlines()] \
116
+ for file in references]
117
+ hypothesis = [x.strip() for x in open(predictions, 'r', encoding='utf-8').readlines()]
118
+
119
+ for i in range(len(pre_references)):
120
+ assert len(hypothesis) == len(pre_references[i])
121
+
122
+ references = []
123
+ for i in range(len(hypothesis)):
124
+ ref_for_instance = []
125
+ for j in range(len(pre_references)):
126
+ ref_for_instance.append(pre_references[j][i])
127
+ references.append(ref_for_instance)
128
+ assert len(references) == len(pre_references)*len(hypothesis)
129
+
130
+
131
+ # calculate ngram match (BLEU)
132
+ tokenized_hyps = [x.split() for x in hypothesis]
133
+ tokenized_refs = [[x.split() for x in reference] for reference in references]
134
+
135
+ ngram_match_score = bleu.corpus_bleu(tokenized_refs,tokenized_hyps)
136
+
137
+ # calculate weighted ngram match
138
+ # from os import listdir
139
+ # from os.path import isfile, join
140
+ # onlyfiles = [f for f in listdir("./keywords") if isfile(join("keywords", f))]
141
+ # print(onlyfiles)
142
+ keywords = [x.strip() for x in open('./keywords/'+ language +'.txt', 'r', encoding='utf-8').readlines()]
143
+ def make_weights(reference_tokens, key_word_list):
144
+ return {token:1 if token in key_word_list else 0.2 \
145
+ for token in reference_tokens}
146
+ tokenized_refs_with_weights = [[[reference_tokens, make_weights(reference_tokens, keywords)]\
147
+ for reference_tokens in reference] for reference in tokenized_refs]
148
+
149
+ weighted_ngram_match_score = weighted_ngram_match.corpus_bleu(tokenized_refs_with_weights,tokenized_hyps)
150
+
151
+ # calculate syntax match
152
+ syntax_match_score = syntax_match.corpus_syntax_match(references, hypothesis, language)
153
+
154
+ # calculate dataflow match
155
+ dataflow_match_score = dataflow_match.corpus_dataflow_match(references, hypothesis, language)
156
+
157
+
158
+
159
+ code_bleu_score = alpha*ngram_match_score\
160
+ + beta*weighted_ngram_match_score\
161
+ + gamma*syntax_match_score\
162
+ + theta*dataflow_match_score
163
+ return {
164
+ "ngram_match_score": ngram_match_score,
165
+ "weighted_ngram_match_score": weighted_ngram_match_score,
166
+ "syntax_match_score": syntax_match_score,
167
+ "dataflow_match_score": dataflow_match_score,
168
+ "code_bleu_score": code_bleu_score
169
+ }
dataflow_match.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ from parser import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
5
+ from parser import (remove_comments_and_docstrings,
6
+ tree_to_token_index,
7
+ index_to_code_token,
8
+ tree_to_variable_index)
9
+ from tree_sitter import Language, Parser
10
+ import pdb
11
+
12
# Maps a language name to the function that extracts its data-flow graph.
# 'c' and 'cpp' reuse the C# extractor.
dfg_function = dict(
    python=DFG_python,
    java=DFG_java,
    ruby=DFG_ruby,
    go=DFG_go,
    php=DFG_php,
    javascript=DFG_javascript,
    c_sharp=DFG_csharp,
    c=DFG_csharp,
    cpp=DFG_csharp,
)
23
+
24
def calc_dataflow_match(references, candidate, lang):
    """Score one candidate by treating it as a single-item corpus.

    Wraps ``references`` and ``candidate`` in one-element lists and delegates
    to ``corpus_dataflow_match``.
    """
    single_item_references = [references]
    single_item_candidates = [candidate]
    return corpus_dataflow_match(single_item_references, single_item_candidates, lang)
26
+
27
def corpus_dataflow_match(references, candidates, lang):
    """Return the fraction of reference data-flow edges found in the candidates.

    For every (candidate, reference) pair the data-flow graphs of both are
    extracted and normalized; each reference edge that also occurs in the
    candidate counts as a match and is consumed so it cannot match twice.

    Args:
        references: for each candidate, a list of reference source strings.
        candidates: list of candidate source strings.
        lang: language name; selects the grammar inside
            ``parser/my-languages.so`` and the entry of ``dfg_function``.

    Returns:
        match_count / total_count over the whole corpus, or 0 (after printing
        a warning) when no reference yields any data-flow edge.
    """
    LANGUAGE = Language('parser/my-languages.so', lang)
    ts_parser = Parser()
    ts_parser.set_language(LANGUAGE)
    # get_data_flow expects the pair [tree-sitter parser, DFG extractor].
    parser = [ts_parser, dfg_function[lang]]
    match_count = 0
    total_count = 0

    for i, candidate in enumerate(candidates):
        references_sample = references[i]
        for reference in references_sample:
            # Strip comments with the rules of the language being scored
            # (this used to pass 'java' unconditionally). Stripping is best
            # effort: on failure the text is used unchanged.
            try:
                candidate = remove_comments_and_docstrings(candidate, lang)
            except Exception:
                pass
            try:
                reference = remove_comments_and_docstrings(reference, lang)
            except Exception:
                pass

            cand_dfg = get_data_flow(candidate, parser)
            ref_dfg = get_data_flow(reference, parser)

            normalized_cand_dfg = normalize_dataflow(cand_dfg)
            normalized_ref_dfg = normalize_dataflow(ref_dfg)

            if len(normalized_ref_dfg) > 0:
                total_count += len(normalized_ref_dfg)
                for dataflow in normalized_ref_dfg:
                    if dataflow in normalized_cand_dfg:
                        match_count += 1
                        # Consume the edge so duplicates need duplicates.
                        normalized_cand_dfg.remove(dataflow)
    if total_count == 0:
        print("WARNING: There is no reference data-flows extracted from the whole corpus, and the data-flow match score degenerates to 0. Please consider ignoring this score.")
        return 0
    score = match_count / total_count
    return score
65
+
66
def get_data_flow(code, parser):
    """Extract the data-flow edges of a source string.

    Args:
        code: source text to analyse.
        parser: two-item sequence ``[tree_sitter_parser, dfg_function]``.

    Returns:
        A list of edges ``(name, idx, relation, parent_names, parent_idxs)``
        with one entry per ``idx``; entries sharing an ``idx`` have their
        parent lists unioned. An empty list is returned when parsing or
        extraction fails.
    """
    try:
        tree = parser[0].parse(bytes(code, 'utf8'))
        root_node = tree.root_node
        tokens_index = tree_to_token_index(root_node)
        # Kept under a separate name so ``code`` is never rebound.
        code_lines = code.split('\n')
        code_tokens = [index_to_code_token(x, code_lines) for x in tokens_index]
        index_to_code = {}
        for idx, (index, token) in enumerate(zip(tokens_index, code_tokens)):
            index_to_code[index] = (idx, token)
        try:
            DFG, _ = parser[1](root_node, index_to_code, {})
        except Exception:
            DFG = []
        DFG = sorted(DFG, key=lambda x: x[1])
        # Keep only nodes that take part in at least one edge, either as the
        # target (non-empty parent list) or as a parent of another node.
        indexs = set()
        for d in DFG:
            if len(d[-1]) != 0:
                indexs.add(d[1])
            for x in d[-1]:
                indexs.add(x)
        dfg = [d for d in DFG if d[1] in indexs]
    except Exception:
        dfg = []
    # merge nodes
    merged = {}
    for d in dfg:
        if d[1] not in merged:
            merged[d[1]] = d
        else:
            previous = merged[d[1]]
            merged[d[1]] = (d[0], d[1], d[2],
                            list(set(previous[3] + d[3])),
                            list(set(previous[4] + d[4])))
    return list(merged.values())
108
+
109
def normalize_dataflow_item(dataflow_item):
    """Rename the variables of a single data-flow edge to ``var_<n>``.

    Names are numbered in order of first appearance, parents first and the
    target variable last, so the result is the same on every run (the
    previous set-based numbering depended on string hash order).

    Args:
        dataflow_item: edge ``(name, idx, relation, parent_names,
            parent_idxs)``; only items 0, 2 and 3 are read.

    Returns:
        ``(normalized_name, relation, normalized_parent_names)``.
    """
    var_name = dataflow_item[0]
    relationship = dataflow_item[2]
    par_vars_name_list = dataflow_item[3]

    norm_names = {}
    for name in par_vars_name_list + [var_name]:
        if name not in norm_names:
            norm_names[name] = 'var_' + str(len(norm_names))

    norm_var_name = norm_names[var_name]
    norm_par_vars_name_list = [norm_names[x] for x in par_vars_name_list]

    return (norm_var_name, relationship, norm_par_vars_name_list)
126
+
127
def normalize_dataflow(dataflow):
    """Rename variables across a whole data-flow graph to ``var_<n>``.

    A single naming table is shared by all edges. Within each edge the parent
    names are numbered before the target name, in order of first appearance.

    Args:
        dataflow: list of edges ``(name, idx, relation, parent_names, ...)``.

    Returns:
        List of ``(normalized_name, relation, normalized_parent_names)``
        tuples, one per input edge and in the same order.
    """
    aliases = {}

    def alias_of(name):
        # The next free number always equals the table size.
        if name not in aliases:
            aliases[name] = 'var_' + str(len(aliases))
        return aliases[name]

    normalized = []
    for edge in dataflow:
        parent_aliases = [alias_of(parent) for parent in edge[3]]
        target_alias = alias_of(edge[0])
        normalized.append((target_alias, edge[2], parent_aliases))
    return normalized
144
+
keywords/c.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto
2
+ else
3
+ long
4
+ switch
5
+ break
6
+ enum
7
+ register
8
+ typedef
9
+ case
10
+ extern
11
+ return
12
+ union
13
+ char
14
+ float
15
+ short
16
+ unsigned
17
+ const
18
+ for
19
+ signed
20
+ void
21
+ continue
22
+ goto
23
+ sizeof
24
+ volatile
25
+ default
26
+ if
27
+ static
28
+ while
29
+ do
30
+ int
31
+ struct
32
+ _Packed
33
+ double
keywords/c_sharp.txt ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ abstract
2
+ as
3
+ base
4
+ bool
5
+ break
6
+ byte
7
+ case
8
+ catch
9
+ char
10
+ checked
11
+ class
12
+ const
13
+ continue
14
+ decimal
15
+ default
16
+ delegate
17
+ do
18
+ double
19
+ else
20
+ enum
21
+ event
22
+ explicit
23
+ extern
24
+ false
25
+ finally
26
+ fixed
27
+ float
28
+ for
29
+ foreach
30
+ goto
31
+ if
32
+ implicit
33
+ in
34
+ int
35
+ interface
36
+ internal
37
+ is
38
+ lock
39
+ long
40
+ namespace
41
+ new
42
+ null
43
+ object
44
+ operator
45
+ out
46
+ override
47
+ params
48
+ private
49
+ protected
50
+ public
51
+ readonly
52
+ ref
53
+ return
54
+ sbyte
55
+ sealed
56
+ short
57
+ sizeof
58
+ stackalloc
59
+ static
60
+ string
61
+ struct
62
+ switch
63
+ this
64
+ throw
65
+ true
66
+ try
67
+ typeof
68
+ uint
69
+ ulong
70
+ unchecked
71
+ unsafe
72
+ ushort
73
+ using
74
+ virtual
75
+ void
76
+ volatile
77
+ while
78
+ add
79
+ alias
80
+ ascending
81
+ async
82
+ await
83
+ by
84
+ descending
85
+ dynamic
86
+ equals
87
+ from
88
+ get
89
+ global
90
+ group
91
+ into
92
+ join
93
+ let
94
+ nameof
95
+ notnull
96
+ on
97
+ orderby
98
+ partial
99
+ remove
100
+ select
101
+ set
102
+ unmanaged
103
+ value
104
+ var
105
+ when
106
+ where
107
+ yield
keywords/cpp.txt ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ auto
2
+ const
3
+ double
4
+ float
5
+ int
6
+ short
7
+ struct
8
+ unsigned
9
+ break
10
+ continue
11
+ else
12
+ for
13
+ long
14
+ signed
15
+ switch
16
+ void
17
+ case
18
+ default
19
+ enum
20
+ goto
21
+ register
22
+ sizeof
23
+ typedef
24
+ volatile
25
+ char
26
+ do
27
+ extern
28
+ if
29
+ return
30
+ static
31
+ union
32
+ while
33
+ asm
34
+ dynamic_cast
35
+ namespace
36
+ reinterpret_cast
37
+ bool
38
+ explicit
39
+ new
40
+ static_cast
41
+ typeid
42
+ catch
43
+ false
44
+ try
45
+ operator
46
+ template
47
+ typename
48
+ class
49
+ friend
50
+ private
51
+ this
52
+ using
53
+ const_cast
54
+ inline
55
+ public
56
+ throw
57
+ virtual
58
+ delete
59
+ mutable
60
+ protected
61
+ true
62
+ wchar_t
63
+ and
64
+ bitand
65
+ compl
66
+ not_eq
67
+ or_eq
68
+ xor_eq
69
+ and_eq
70
+ bitor
71
+ not
72
+ or
73
+ xor
74
+ cin
75
+ endl
76
+ INT_MIN
77
+ iomanip
78
+ main
79
+ npos
80
+ std
81
+ cout
82
+ include
83
+ INT_MAX
84
+ iostream
85
+ MAX_RAND
86
+ NULL
87
+ string
keywords/java.txt ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ abstract
2
+ assert
3
+ boolean
4
+ break
5
+ byte
6
+ case
7
+ catch
8
+ char
9
+ class
10
+ const
11
+ continue
12
+ default
13
+ do
14
+ double
15
+ else
16
+ enum
17
+ extends
18
+ final
19
+ finally
20
+ float
21
+ for
22
+ goto
23
+ if
24
+ implements
25
+ import
26
+ instanceof
27
+ int
28
+ interface
29
+ long
30
+ native
31
+ new
32
+ package
33
+ private
34
+ protected
35
+ public
36
+ return
37
+ short
38
+ static
39
+ strictfp
40
+ super
41
+ switch
42
+ synchronized
43
+ this
44
+ throw
45
+ throws
46
+ transient
47
+ try
48
+ void
49
+ volatile
50
+ while
keywords/javascript.txt ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ await
2
+ break
3
+ case
4
+ catch
5
+ class
6
+ const
7
+ continue
8
+ debugger
9
+ default
10
+ delete
11
+ do
12
+ else
13
+ enum
14
+ export
15
+ extends
16
+ false
17
+ finally
18
+ for
19
+ function
20
+ if
21
+ implements
22
+ import
23
+ in
24
+ instanceof
25
+ interface
26
+ let
27
+ new
28
+ null
29
+ package
30
+ private
31
+ protected
32
+ public
33
+ return
34
+ super
35
+ switch
36
+ static
37
+ this
38
+ throw
39
+ try
40
+ true
41
+ typeof
42
+ var
43
+ void
44
+ while
45
+ with
46
+ yield
keywords/php.txt ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __halt_compiler
2
+ abstract
3
+ and
4
+ array
5
+ as
6
+ break
7
+ callable
8
+ case
9
+ catch
10
+ class
11
+ clone
12
+ const
13
+ continue
14
+ declare
15
+ default
16
+ die
17
+ do
18
+ echo
19
+ else
20
+ elseif
21
+ empty
22
+ enddeclare
23
+ endfor
24
+ endforeach
25
+ endif
26
+ endswitch
27
+ endwhile
28
+ eval
29
+ exit
30
+ extends
31
+ final
32
+ for
33
+ foreach
34
+ function
35
+ global
36
+ goto
37
+ if
38
+ implements
39
+ include
40
+ include_once
41
+ instanceof
42
+ insteadof
43
+ interface
44
+ isset
45
+ list
46
+ namespace
47
+ new
48
+ or
49
+ print
50
+ private
51
+ protected
52
+ public
53
+ require
54
+ require_once
55
+ return
56
+ static
57
+ switch
58
+ throw
59
+ trait
60
+ try
61
+ unset
62
+ use
63
+ var
64
+ while
65
+ xor
keywords/python.txt ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ False
2
+ None
3
+ True
4
+ and
5
+ as
6
+ assert
7
+ async
8
+ await
9
+ break
10
+ class
11
+ continue
12
+ def
13
+ del
14
+ elif
15
+ else
16
+ except
17
+ finally
18
+ for
19
+ from
20
+ global
21
+ if
22
+ import
23
+ in
24
+ is
25
+ lambda
26
+ nonlocal
27
+ not
28
+ or
29
+ pass
30
+ raise
31
+ return
32
+ try
33
+ while
34
+ with
35
+ yield
parser/DFG.py ADDED
@@ -0,0 +1,1184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ from tree_sitter import Language, Parser
5
+ from .utils import (remove_comments_and_docstrings,
6
+ tree_to_token_index,
7
+ index_to_code_token,
8
+ tree_to_variable_index)
9
+
10
+
11
def DFG_python(root_node,index_to_code,states):
    """Recursively collect data-flow edges for the subtree rooted at root_node.

    Args:
        root_node: syntax-tree node; this function reads its ``type``,
            ``children``, ``start_point`` and ``end_point`` and calls
            ``child_by_field_name`` (presumably a tree-sitter node -- confirm
            against the caller).
        index_to_code: dict mapping ``(start_point, end_point)`` of a token to
            the pair ``(idx, code)``.
        states: dict mapping a token text to the list of ``idx`` values it
            currently comes from; copied on entry, so the caller's dict is
            not rebound.

    Returns:
        ``(DFG, states)`` where ``DFG`` is a list of tuples
        ``(code, idx, relation, parent_codes, parent_idxs)`` sorted by ``idx``
        with ``relation`` being ``'comesFrom'`` or ``'computedFrom'``, and
        ``states`` is the updated mapping.
    """
    # Node types handled by each branch below.
    assignment=['assignment','augmented_assignment','for_in_clause']
    if_statement=['if_statement']
    for_statement=['for_statement']
    while_statement=['while_statement']
    do_first_statement=['for_in_clause']
    def_statement=['default_parameter']
    states=states.copy()
    # Leaf token (or a string/character literal treated as one token).
    if (len(root_node.children)==0 or root_node.type in ['string_literal','string','character_literal']) and root_node.type!='comment':
        idx,code=index_to_code[(root_node.start_point,root_node.end_point)]
        if root_node.type==code:
            # Node type equals its own text: contributes no edge.
            return [],states
        elif code in states:
            # Already-known name: link this use to its recorded positions.
            return [(code,idx,'comesFrom',[code],states[code].copy())],states
        else:
            # First sighting: only identifiers are remembered in states.
            if root_node.type=='identifier':
                states[code]=[idx]
            return [(code,idx,'comesFrom',[],[])],states
    elif root_node.type in def_statement:
        name=root_node.child_by_field_name('name')
        value=root_node.child_by_field_name('value')
        DFG=[]
        if value is None:
            # No value: register each name token with no parents.
            indexs=tree_to_variable_index(name,index_to_code)
            for index in indexs:
                idx,code=index_to_code[index]
                DFG.append((code,idx,'comesFrom',[],[]))
                states[code]=[idx]
            return sorted(DFG,key=lambda x:x[1]),states
        else:
            name_indexs=tree_to_variable_index(name,index_to_code)
            value_indexs=tree_to_variable_index(value,index_to_code)
            # Process the value subtree first, then link every name token to
            # every value token.
            temp,states=DFG_python(value,index_to_code,states)
            DFG+=temp
            for index1 in name_indexs:
                idx1,code1=index_to_code[index1]
                for index2 in value_indexs:
                    idx2,code2=index_to_code[index2]
                    DFG.append((code1,idx1,'comesFrom',[code2],[idx2]))
                states[code1]=[idx1]
            return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in assignment:
        if root_node.type=='for_in_clause':
            # The last child is taken as the right-hand side.
            right_nodes=[root_node.children[-1]]
            left_nodes=[root_node.child_by_field_name('left')]
        else:
            if root_node.child_by_field_name('right') is None:
                return [],states
            # Pair targets with values element-wise, ignoring comma tokens.
            left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
            right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
            # Fall back to whole-side pairing when the counts differ or a
            # side has no children.
            if len(right_nodes)!=len(left_nodes):
                left_nodes=[root_node.child_by_field_name('left')]
                right_nodes=[root_node.child_by_field_name('right')]
            if len(left_nodes)==0:
                left_nodes=[root_node.child_by_field_name('left')]
            if len(right_nodes)==0:
                right_nodes=[root_node.child_by_field_name('right')]
        DFG=[]
        # Right-hand sides are processed before the targets are updated.
        for node in right_nodes:
            temp,states=DFG_python(node,index_to_code,states)
            DFG+=temp

        for left_node,right_node in zip(left_nodes,right_nodes):
            left_tokens_index=tree_to_variable_index(left_node,index_to_code)
            right_tokens_index=tree_to_variable_index(right_node,index_to_code)
            temp=[]
            for token1_index in left_tokens_index:
                idx1,code1=index_to_code[token1_index]
                # One edge per target token, with all right-hand tokens as parents.
                temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
                             [index_to_code[x][0] for x in right_tokens_index]))
                states[code1]=[idx1]
            DFG+=temp
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in if_statement:
        DFG=[]
        current_states=states.copy()
        others_states=[]
        # tag records whether any child type contains 'else'.
        tag=False
        if 'else' in root_node.type:
            tag=True
        for child in root_node.children:
            if 'else' in child.type:
                tag=True
            if child.type not in ['elif_clause','else_clause']:
                # Condition and main body thread their states sequentially.
                temp,current_states=DFG_python(child,index_to_code,current_states)
                DFG+=temp
            else:
                # Each elif/else clause starts from the states at entry.
                temp,new_states=DFG_python(child,index_to_code,states)
                DFG+=temp
                others_states.append(new_states)
        others_states.append(current_states)
        if tag is False:
            # No 'else' child seen: the entry states are merged in as well.
            others_states.append(states)
        # Merge all collected states: union of positions per name.
        new_states={}
        for dic in others_states:
            for key in dic:
                if key not in new_states:
                    new_states[key]=dic[key].copy()
                else:
                    new_states[key]+=dic[key]
        for key in new_states:
            new_states[key]=sorted(list(set(new_states[key])))
        return sorted(DFG,key=lambda x:x[1]),new_states
    elif root_node.type in for_statement:
        DFG=[]
        # The statement is processed twice, carrying states over from the
        # first pass (presumably so values assigned in the body reach uses
        # that precede them -- confirm intent).
        for i in range(2):
            right_nodes=[x for x in root_node.child_by_field_name('right').children if x.type!=',']
            left_nodes=[x for x in root_node.child_by_field_name('left').children if x.type!=',']
            if len(right_nodes)!=len(left_nodes):
                left_nodes=[root_node.child_by_field_name('left')]
                right_nodes=[root_node.child_by_field_name('right')]
            if len(left_nodes)==0:
                left_nodes=[root_node.child_by_field_name('left')]
            if len(right_nodes)==0:
                right_nodes=[root_node.child_by_field_name('right')]
            for node in right_nodes:
                temp,states=DFG_python(node,index_to_code,states)
                DFG+=temp
            for left_node,right_node in zip(left_nodes,right_nodes):
                left_tokens_index=tree_to_variable_index(left_node,index_to_code)
                right_tokens_index=tree_to_variable_index(right_node,index_to_code)
                temp=[]
                for token1_index in left_tokens_index:
                    idx1,code1=index_to_code[token1_index]
                    temp.append((code1,idx1,'computedFrom',[index_to_code[x][1] for x in right_tokens_index],
                                 [index_to_code[x][0] for x in right_tokens_index]))
                    states[code1]=[idx1]
                DFG+=temp
            if root_node.children[-1].type=="block":
                temp,states=DFG_python(root_node.children[-1],index_to_code,states)
                DFG+=temp
        # Merge edges with the same (code, idx, relation), unioning parents.
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    elif root_node.type in while_statement:
        DFG=[]
        # All children are processed twice, carrying states across passes.
        for i in range(2):
            for child in root_node.children:
                temp,states=DFG_python(child,index_to_code,states)
                DFG+=temp
        # Merge edges with the same (code, idx, relation), unioning parents.
        dic={}
        for x in DFG:
            if (x[0],x[1],x[2]) not in dic:
                dic[(x[0],x[1],x[2])]=[x[3],x[4]]
            else:
                dic[(x[0],x[1],x[2])][0]=list(set(dic[(x[0],x[1],x[2])][0]+x[3]))
                dic[(x[0],x[1],x[2])][1]=sorted(list(set(dic[(x[0],x[1],x[2])][1]+x[4])))
        DFG=[(x[0],x[1],x[2],y[0],y[1]) for x,y in sorted(dic.items(),key=lambda t:t[0][1])]
        return sorted(DFG,key=lambda x:x[1]),states
    else:
        DFG=[]
        # Children of a "do first" type (for_in_clause) are visited before
        # all other children, regardless of their position.
        for child in root_node.children:
            if child.type in do_first_statement:
                temp,states=DFG_python(child,index_to_code,states)
                DFG+=temp
        for child in root_node.children:
            if child.type not in do_first_statement:
                temp,states=DFG_python(child,index_to_code,states)
                DFG+=temp

        return sorted(DFG,key=lambda x:x[1]),states
178
+
179
+
180
def _java_sort_edges(edges):
    """Return ``edges`` ordered by their token index (stable)."""
    return sorted(edges, key=lambda edge: edge[1])


def _java_merge_edges(edges):
    """Collapse edges sharing (code, index, relation) into a single edge.

    The first occurrence keeps its source lists as they are; every later
    occurrence is unioned in (source codes de-duplicated, source indexes
    de-duplicated and sorted).  The result is ordered by token index.
    """
    merged = {}
    for code, idx, relation, src_codes, src_idxs in edges:
        key = (code, idx, relation)
        if key in merged:
            seen_codes, seen_idxs = merged[key]
            merged[key] = [list(set(seen_codes + src_codes)),
                           sorted(set(seen_idxs + src_idxs))]
        else:
            merged[key] = [src_codes, src_idxs]
    return _java_sort_edges(
        [key + (codes, idxs) for key, (codes, idxs) in merged.items()])


def _java_link(targets, sources, relation, index_to_code, states, edges):
    """Append one ``relation`` edge per (target, source) pair.

    Each target is also recorded in ``states`` as the latest definition of
    its token.  ``states`` and ``edges`` are updated in place.
    """
    for target in targets:
        t_idx, t_code = index_to_code[target]
        for source in sources:
            s_idx, s_code = index_to_code[source]
            edges.append((t_code, t_idx, relation, [s_code], [s_idx]))
        states[t_code] = [t_idx]


def _java_join_states(branches):
    """Union several state maps; each token gets a sorted, unique index list."""
    joined = {}
    for branch in branches:
        for token, idxs in branch.items():
            if token in joined:
                joined[token] += idxs
            else:
                joined[token] = idxs.copy()
    return {token: sorted(set(idxs)) for token, idxs in joined.items()}


def DFG_java(root_node, index_to_code, states):
    """Extract data-flow edges for the Java syntax subtree at ``root_node``.

    Args:
        root_node: syntax-tree node exposing ``type``, ``children``,
            ``start_point``, ``end_point`` and ``child_by_field_name``.
        index_to_code: mapping from ``(start_point, end_point)`` to an
            ``(index, code)`` pair for each leaf token.
        states: mapping from token text to the list of indexes where that
            token was last defined.  A copy is taken before any update.

    Returns:
        ``(edges, states)``.  ``edges`` is a list of
        ``(code, index, relation, source_codes, source_indexes)`` tuples
        sorted by ``index``, where ``relation`` is ``'comesFrom'`` or
        ``'computedFrom'``; ``states`` is the updated definition map.
    """
    branch_types = ('if_statement', 'else')
    do_first_types = ()

    states = states.copy()
    node_type = root_node.type
    edges = []

    is_token = len(root_node.children) == 0 or node_type in (
        'string_literal', 'string', 'character_literal')
    if is_token and node_type != 'comment':
        idx, code = index_to_code[(root_node.start_point, root_node.end_point)]
        if node_type == code:
            # Punctuation / keywords: the node type is the token text itself.
            return [], states
        if code in states:
            return [(code, idx, 'comesFrom', [code], states[code].copy())], states
        if node_type == 'identifier':
            states[code] = [idx]
        return [(code, idx, 'comesFrom', [], [])], states

    if node_type == 'variable_declarator':
        name = root_node.child_by_field_name('name')
        value = root_node.child_by_field_name('value')
        if value is None:
            for position in tree_to_variable_index(name, index_to_code):
                idx, code = index_to_code[position]
                edges.append((code, idx, 'comesFrom', [], []))
                states[code] = [idx]
        else:
            targets = tree_to_variable_index(name, index_to_code)
            sources = tree_to_variable_index(value, index_to_code)
            sub_edges, states = DFG_java(value, index_to_code, states)
            edges += sub_edges
            _java_link(targets, sources, 'comesFrom', index_to_code, states, edges)
        return _java_sort_edges(edges), states

    if node_type == 'assignment_expression':
        left = root_node.child_by_field_name('left')
        right = root_node.child_by_field_name('right')
        sub_edges, states = DFG_java(right, index_to_code, states)
        edges += sub_edges
        targets = tree_to_variable_index(left, index_to_code)
        sources = tree_to_variable_index(right, index_to_code)
        _java_link(targets, sources, 'computedFrom', index_to_code, states, edges)
        return _java_sort_edges(edges), states

    if node_type == 'update_expression':
        operands = tree_to_variable_index(root_node, index_to_code)
        _java_link(operands, operands, 'computedFrom', index_to_code, states, edges)
        return _java_sort_edges(edges), states

    if node_type in branch_types:
        taken = states.copy()
        alternatives = []
        reached_branch = False
        has_else = 'else' in node_type
        for child in root_node.children:
            if 'else' in child.type:
                has_else = True
            if child.type in branch_types or reached_branch:
                # From the first nested branch on, every child is analysed
                # against the states that held before this statement.
                reached_branch = True
                sub_edges, branch_states = DFG_java(child, index_to_code, states)
                edges += sub_edges
                alternatives.append(branch_states)
            else:
                sub_edges, taken = DFG_java(child, index_to_code, taken)
                edges += sub_edges
        alternatives.append(taken)
        if not has_else:
            # Without an else the incoming definitions stay reachable.
            alternatives.append(states)
        return _java_sort_edges(edges), _java_join_states(alternatives)

    if node_type == 'for_statement':
        for child in root_node.children:
            sub_edges, states = DFG_java(child, index_to_code, states)
            edges += sub_edges
        # Second pass: revisit everything after the initialiser declaration.
        past_init = False
        for child in root_node.children:
            if past_init:
                sub_edges, states = DFG_java(child, index_to_code, states)
                edges += sub_edges
            elif child.type == "local_variable_declaration":
                past_init = True
        return _java_merge_edges(edges), states

    if node_type == 'enhanced_for_statement':
        name = root_node.child_by_field_name('name')
        value = root_node.child_by_field_name('value')
        body = root_node.child_by_field_name('body')
        for _ in range(2):
            sub_edges, states = DFG_java(value, index_to_code, states)
            edges += sub_edges
            targets = tree_to_variable_index(name, index_to_code)
            sources = tree_to_variable_index(value, index_to_code)
            _java_link(targets, sources, 'computedFrom', index_to_code, states, edges)
            sub_edges, states = DFG_java(body, index_to_code, states)
            edges += sub_edges
        return _java_merge_edges(edges), states

    if node_type == 'while_statement':
        for _ in range(2):
            for child in root_node.children:
                sub_edges, states = DFG_java(child, index_to_code, states)
                edges += sub_edges
        return _java_merge_edges(edges), states

    # Any other node: visit "do first" children, then the remaining ones.
    ordered = [c for c in root_node.children if c.type in do_first_types]
    ordered += [c for c in root_node.children if c.type not in do_first_types]
    for child in ordered:
        sub_edges, states = DFG_java(child, index_to_code, states)
        edges += sub_edges
    return _java_sort_edges(edges), states
355
+
356
def _csharp_sort_edges(edges):
    """Return ``edges`` ordered by their token index (stable)."""
    return sorted(edges, key=lambda edge: edge[1])


def _csharp_merge_edges(edges):
    """Collapse edges sharing (code, index, relation) into a single edge.

    The first occurrence keeps its source lists as they are; later ones are
    unioned in (source codes de-duplicated, source indexes de-duplicated and
    sorted).  The result is ordered by token index.
    """
    merged = {}
    for code, idx, relation, src_codes, src_idxs in edges:
        key = (code, idx, relation)
        if key in merged:
            seen_codes, seen_idxs = merged[key]
            merged[key] = [list(set(seen_codes + src_codes)),
                           sorted(set(seen_idxs + src_idxs))]
        else:
            merged[key] = [src_codes, src_idxs]
    return _csharp_sort_edges(
        [key + (codes, idxs) for key, (codes, idxs) in merged.items()])


def _csharp_link(targets, sources, relation, index_to_code, states, edges):
    """Append one ``relation`` edge per (target, source) pair.

    Each target is also recorded in ``states`` as the latest definition of
    its token.  ``states`` and ``edges`` are updated in place.
    """
    for target in targets:
        t_idx, t_code = index_to_code[target]
        for source in sources:
            s_idx, s_code = index_to_code[source]
            edges.append((t_code, t_idx, relation, [s_code], [s_idx]))
        states[t_code] = [t_idx]


def _csharp_join_states(branches):
    """Union several state maps; each token gets a sorted, unique index list."""
    joined = {}
    for branch in branches:
        for token, idxs in branch.items():
            if token in joined:
                joined[token] += idxs
            else:
                joined[token] = idxs.copy()
    return {token: sorted(set(idxs)) for token, idxs in joined.items()}


def DFG_csharp(root_node, index_to_code, states):
    """Extract data-flow edges for the C# syntax subtree at ``root_node``.

    Args:
        root_node: syntax-tree node exposing ``type``, ``children``,
            ``start_point``, ``end_point`` and ``child_by_field_name``.
        index_to_code: mapping from ``(start_point, end_point)`` to an
            ``(index, code)`` pair for each leaf token.
        states: mapping from token text to the list of indexes where that
            token was last defined.  A copy is taken before any update.

    Returns:
        ``(edges, states)``.  ``edges`` is a list of
        ``(code, index, relation, source_codes, source_indexes)`` tuples
        sorted by ``index``, where ``relation`` is ``'comesFrom'`` or
        ``'computedFrom'``; ``states`` is the updated definition map.
    """
    branch_types = ('if_statement', 'else')
    do_first_types = ()

    states = states.copy()
    node_type = root_node.type
    edges = []

    is_token = len(root_node.children) == 0 or node_type in (
        'string_literal', 'string', 'character_literal')
    if is_token and node_type != 'comment':
        idx, code = index_to_code[(root_node.start_point, root_node.end_point)]
        if node_type == code:
            # Punctuation / keywords: the node type is the token text itself.
            return [], states
        if code in states:
            return [(code, idx, 'comesFrom', [code], states[code].copy())], states
        if node_type == 'identifier':
            states[code] = [idx]
        return [(code, idx, 'comesFrom', [], [])], states

    if node_type == 'variable_declarator':
        # Exactly two children are read as (name, value); any other child
        # count is treated as a declaration without a value.
        name = root_node.children[0]
        value = root_node.children[1] if len(root_node.children) == 2 else None
        if value is None:
            for position in tree_to_variable_index(name, index_to_code):
                idx, code = index_to_code[position]
                edges.append((code, idx, 'comesFrom', [], []))
                states[code] = [idx]
        else:
            targets = tree_to_variable_index(name, index_to_code)
            sources = tree_to_variable_index(value, index_to_code)
            sub_edges, states = DFG_csharp(value, index_to_code, states)
            edges += sub_edges
            _csharp_link(targets, sources, 'comesFrom', index_to_code, states, edges)
        return _csharp_sort_edges(edges), states

    if node_type == 'assignment_expression':
        left = root_node.child_by_field_name('left')
        right = root_node.child_by_field_name('right')
        sub_edges, states = DFG_csharp(right, index_to_code, states)
        edges += sub_edges
        targets = tree_to_variable_index(left, index_to_code)
        sources = tree_to_variable_index(right, index_to_code)
        _csharp_link(targets, sources, 'computedFrom', index_to_code, states, edges)
        return _csharp_sort_edges(edges), states

    if node_type == 'postfix_unary_expression':
        operands = tree_to_variable_index(root_node, index_to_code)
        _csharp_link(operands, operands, 'computedFrom', index_to_code, states, edges)
        return _csharp_sort_edges(edges), states

    if node_type in branch_types:
        taken = states.copy()
        alternatives = []
        reached_branch = False
        has_else = 'else' in node_type
        for child in root_node.children:
            if 'else' in child.type:
                has_else = True
            if child.type in branch_types or reached_branch:
                # From the first nested branch on, every child is analysed
                # against the states that held before this statement.
                reached_branch = True
                sub_edges, branch_states = DFG_csharp(child, index_to_code, states)
                edges += sub_edges
                alternatives.append(branch_states)
            else:
                sub_edges, taken = DFG_csharp(child, index_to_code, taken)
                edges += sub_edges
        alternatives.append(taken)
        if not has_else:
            # Without an else the incoming definitions stay reachable.
            alternatives.append(states)
        return _csharp_sort_edges(edges), _csharp_join_states(alternatives)

    if node_type == 'for_statement':
        for child in root_node.children:
            sub_edges, states = DFG_csharp(child, index_to_code, states)
            edges += sub_edges
        # Second pass: revisit everything after the initialiser declaration.
        past_init = False
        for child in root_node.children:
            if past_init:
                sub_edges, states = DFG_csharp(child, index_to_code, states)
                edges += sub_edges
            elif child.type == "local_variable_declaration":
                past_init = True
        return _csharp_merge_edges(edges), states

    if node_type == 'for_each_statement':
        name = root_node.child_by_field_name('left')
        value = root_node.child_by_field_name('right')
        body = root_node.child_by_field_name('body')
        for _ in range(2):
            sub_edges, states = DFG_csharp(value, index_to_code, states)
            edges += sub_edges
            targets = tree_to_variable_index(name, index_to_code)
            sources = tree_to_variable_index(value, index_to_code)
            _csharp_link(targets, sources, 'computedFrom', index_to_code, states, edges)
            sub_edges, states = DFG_csharp(body, index_to_code, states)
            edges += sub_edges
        return _csharp_merge_edges(edges), states

    if node_type == 'while_statement':
        for _ in range(2):
            for child in root_node.children:
                sub_edges, states = DFG_csharp(child, index_to_code, states)
                edges += sub_edges
        return _csharp_merge_edges(edges), states

    # Any other node: visit "do first" children, then the remaining ones.
    ordered = [c for c in root_node.children if c.type in do_first_types]
    ordered += [c for c in root_node.children if c.type not in do_first_types]
    for child in ordered:
        sub_edges, states = DFG_csharp(child, index_to_code, states)
        edges += sub_edges
    return _csharp_sort_edges(edges), states
535
+
536
+
537
+
538
+
539
def _ruby_sort_edges(edges):
    """Return ``edges`` ordered by their token index (stable)."""
    return sorted(edges, key=lambda edge: edge[1])


def _ruby_merge_edges(edges):
    """Collapse edges sharing (code, index, relation) into a single edge.

    The first occurrence keeps its source lists as they are; later ones are
    unioned in (source codes de-duplicated, source indexes de-duplicated and
    sorted).  The result is ordered by token index.
    """
    merged = {}
    for code, idx, relation, src_codes, src_idxs in edges:
        key = (code, idx, relation)
        if key in merged:
            seen_codes, seen_idxs = merged[key]
            merged[key] = [list(set(seen_codes + src_codes)),
                           sorted(set(seen_idxs + src_idxs))]
        else:
            merged[key] = [src_codes, src_idxs]
    return _ruby_sort_edges(
        [key + (codes, idxs) for key, (codes, idxs) in merged.items()])


def _ruby_link_pairs(left_nodes, right_nodes, index_to_code, states, edges):
    """Pair left/right nodes positionally and add ``computedFrom`` edges.

    Every variable token under a left node gets a single edge whose sources
    are all variable tokens under the matching right node, and becomes the
    latest definition of its token in ``states``.  ``states`` and ``edges``
    are updated in place.
    """
    for left_node, right_node in zip(left_nodes, right_nodes):
        targets = tree_to_variable_index(left_node, index_to_code)
        sources = tree_to_variable_index(right_node, index_to_code)
        for target in targets:
            idx, code = index_to_code[target]
            edges.append((code, idx, 'computedFrom',
                          [index_to_code[s][1] for s in sources],
                          [index_to_code[s][0] for s in sources]))
            states[code] = [idx]


def _ruby_join_states(branches):
    """Union several state maps; each token gets a sorted, unique index list."""
    joined = {}
    for branch in branches:
        for token, idxs in branch.items():
            if token in joined:
                joined[token] += idxs
            else:
                joined[token] = idxs.copy()
    return {token: sorted(set(idxs)) for token, idxs in joined.items()}


def DFG_ruby(root_node, index_to_code, states):
    """Extract data-flow edges for the Ruby syntax subtree at ``root_node``.

    Args:
        root_node: syntax-tree node exposing ``type``, ``children``,
            ``start_point``, ``end_point`` and ``child_by_field_name``.
        index_to_code: mapping from ``(start_point, end_point)`` to an
            ``(index, code)`` pair for each leaf token.
        states: mapping from token text to the list of indexes where that
            token was last defined.  A copy is taken before any update, so
            the caller's dict is left untouched.

    Returns:
        ``(edges, states)``.  ``edges`` is a list of
        ``(code, index, relation, source_codes, source_indexes)`` tuples
        sorted by ``index``, where ``relation`` is ``'comesFrom'`` or
        ``'computedFrom'``; ``states`` is the updated definition map.
    """
    assignment_types = ('assignment', 'operator_assignment')
    branch_types = ('if', 'elsif', 'else', 'unless', 'when')
    loop_types = ('while_modifier', 'until')
    do_first_types = ()

    # Copy once for every branch (previously only the leaf branch copied,
    # so definition/assignment/for nodes wrote into the caller's dict).
    states = states.copy()
    node_type = root_node.type
    edges = []

    is_token = len(root_node.children) == 0 or node_type in (
        'string_literal', 'string', 'character_literal')
    if is_token and node_type != 'comment':
        idx, code = index_to_code[(root_node.start_point, root_node.end_point)]
        if node_type == code:
            # Punctuation / keywords: the node type is the token text itself.
            return [], states
        if code in states:
            return [(code, idx, 'comesFrom', [code], states[code].copy())], states
        if node_type == 'identifier':
            states[code] = [idx]
        return [(code, idx, 'comesFrom', [], [])], states

    if node_type == 'keyword_parameter':
        name = root_node.child_by_field_name('name')
        value = root_node.child_by_field_name('value')
        if value is None:
            for position in tree_to_variable_index(name, index_to_code):
                idx, code = index_to_code[position]
                edges.append((code, idx, 'comesFrom', [], []))
                states[code] = [idx]
        else:
            targets = tree_to_variable_index(name, index_to_code)
            sources = tree_to_variable_index(value, index_to_code)
            sub_edges, states = DFG_ruby(value, index_to_code, states)
            edges += sub_edges
            for target in targets:
                t_idx, t_code = index_to_code[target]
                for source in sources:
                    s_idx, s_code = index_to_code[source]
                    edges.append((t_code, t_idx, 'comesFrom', [s_code], [s_idx]))
                states[t_code] = [t_idx]
        return _ruby_sort_edges(edges), states

    if node_type in assignment_types:
        left = root_node.child_by_field_name('left')
        right = root_node.child_by_field_name('right')
        # Try element-wise pairing (a, b = c, d); fall back to treating each
        # side as a whole when the element counts differ or a side is empty.
        left_nodes = [n for n in left.children if n.type != ',']
        right_nodes = [n for n in right.children if n.type != ',']
        if len(right_nodes) != len(left_nodes):
            left_nodes = [left]
            right_nodes = [right]
        if len(left_nodes) == 0:
            left_nodes = [left]
        if len(right_nodes) == 0:
            right_nodes = [right]
        if node_type == "operator_assignment":
            left_nodes = [root_node.children[0]]
            right_nodes = [root_node.children[-1]]

        for node in right_nodes:
            sub_edges, states = DFG_ruby(node, index_to_code, states)
            edges += sub_edges
        _ruby_link_pairs(left_nodes, right_nodes, index_to_code, states, edges)
        return _ruby_sort_edges(edges), states

    if node_type in branch_types:
        taken = states.copy()
        alternatives = []
        has_else = 'else' in node_type
        for child in root_node.children:
            if 'else' in child.type:
                has_else = True
            if child.type in branch_types:
                # Nested branches start from the states before this node.
                sub_edges, branch_states = DFG_ruby(child, index_to_code, states)
                edges += sub_edges
                alternatives.append(branch_states)
            else:
                sub_edges, taken = DFG_ruby(child, index_to_code, taken)
                edges += sub_edges
        alternatives.append(taken)
        if not has_else:
            # Without an else the incoming definitions stay reachable.
            alternatives.append(states)
        return _ruby_sort_edges(edges), _ruby_join_states(alternatives)

    if node_type == 'for':
        pattern = root_node.child_by_field_name('pattern')
        value = root_node.child_by_field_name('value')
        body = root_node.child_by_field_name('body')
        for _ in range(2):
            sub_edges, states = DFG_ruby(value, index_to_code, states)
            edges += sub_edges
            _ruby_link_pairs([pattern], [value], index_to_code, states, edges)
            sub_edges, states = DFG_ruby(body, index_to_code, states)
            edges += sub_edges
        return _ruby_merge_edges(edges), states

    if node_type in loop_types:
        for _ in range(2):
            for child in root_node.children:
                sub_edges, states = DFG_ruby(child, index_to_code, states)
                edges += sub_edges
        return _ruby_merge_edges(edges), states

    # Any other node: visit "do first" children, then the remaining ones.
    ordered = [c for c in root_node.children if c.type in do_first_types]
    ordered += [c for c in root_node.children if c.type not in do_first_types]
    for child in ordered:
        sub_edges, states = DFG_ruby(child, index_to_code, states)
        edges += sub_edges
    return _ruby_sort_edges(edges), states
697
+
698
def _go_sort_edges(edges):
    """Return ``edges`` ordered by their token index (stable)."""
    return sorted(edges, key=lambda edge: edge[1])


def _go_merge_edges(edges):
    """Collapse edges sharing (code, index, relation) into a single edge.

    The first occurrence keeps its source lists as they are; later ones are
    unioned in (source codes de-duplicated, source indexes de-duplicated and
    sorted).  The result is ordered by token index.
    """
    merged = {}
    for code, idx, relation, src_codes, src_idxs in edges:
        key = (code, idx, relation)
        if key in merged:
            seen_codes, seen_idxs = merged[key]
            merged[key] = [list(set(seen_codes + src_codes)),
                           sorted(set(seen_idxs + src_idxs))]
        else:
            merged[key] = [src_codes, src_idxs]
    return _go_sort_edges(
        [key + (codes, idxs) for key, (codes, idxs) in merged.items()])


def _go_link(targets, sources, relation, index_to_code, states, edges):
    """Append one ``relation`` edge per (target, source) pair.

    Each target is also recorded in ``states`` as the latest definition of
    its token.  ``states`` and ``edges`` are updated in place.
    """
    for target in targets:
        t_idx, t_code = index_to_code[target]
        for source in sources:
            s_idx, s_code = index_to_code[source]
            edges.append((t_code, t_idx, relation, [s_code], [s_idx]))
        states[t_code] = [t_idx]


def _go_join_states(branches):
    """Union several state maps; each token gets a sorted, unique index list."""
    joined = {}
    for branch in branches:
        for token, idxs in branch.items():
            if token in joined:
                joined[token] += idxs
            else:
                joined[token] = idxs.copy()
    return {token: sorted(set(idxs)) for token, idxs in joined.items()}


def DFG_go(root_node, index_to_code, states):
    """Extract data-flow edges for the Go syntax subtree at ``root_node``.

    Args:
        root_node: syntax-tree node exposing ``type``, ``children``,
            ``start_point``, ``end_point`` and ``child_by_field_name``.
        index_to_code: mapping from ``(start_point, end_point)`` to an
            ``(index, code)`` pair for each leaf token.
        states: mapping from token text to the list of indexes where that
            token was last defined.  A copy is taken before any update.

    Returns:
        ``(edges, states)``.  ``edges`` is a list of
        ``(code, index, relation, source_codes, source_indexes)`` tuples
        sorted by ``index``, where ``relation`` is ``'comesFrom'`` or
        ``'computedFrom'``; ``states`` is the updated definition map.
    """
    branch_types = ('if_statement', 'else')
    do_first_types = ()

    states = states.copy()
    node_type = root_node.type
    edges = []

    is_token = len(root_node.children) == 0 or node_type in (
        'string_literal', 'string', 'character_literal')
    if is_token and node_type != 'comment':
        idx, code = index_to_code[(root_node.start_point, root_node.end_point)]
        if node_type == code:
            # Punctuation / keywords: the node type is the token text itself.
            return [], states
        if code in states:
            return [(code, idx, 'comesFrom', [code], states[code].copy())], states
        if node_type == 'identifier':
            states[code] = [idx]
        return [(code, idx, 'comesFrom', [], [])], states

    if node_type == 'var_spec':
        name = root_node.child_by_field_name('name')
        value = root_node.child_by_field_name('value')
        if value is None:
            for position in tree_to_variable_index(name, index_to_code):
                idx, code = index_to_code[position]
                edges.append((code, idx, 'comesFrom', [], []))
                states[code] = [idx]
        else:
            targets = tree_to_variable_index(name, index_to_code)
            sources = tree_to_variable_index(value, index_to_code)
            sub_edges, states = DFG_go(value, index_to_code, states)
            edges += sub_edges
            _go_link(targets, sources, 'comesFrom', index_to_code, states, edges)
        return _go_sort_edges(edges), states

    if node_type == 'assignment_statement':
        left = root_node.child_by_field_name('left')
        right = root_node.child_by_field_name('right')
        sub_edges, states = DFG_go(right, index_to_code, states)
        edges += sub_edges
        targets = tree_to_variable_index(left, index_to_code)
        sources = tree_to_variable_index(right, index_to_code)
        _go_link(targets, sources, 'computedFrom', index_to_code, states, edges)
        return _go_sort_edges(edges), states

    if node_type == 'inc_statement':
        operands = tree_to_variable_index(root_node, index_to_code)
        _go_link(operands, operands, 'computedFrom', index_to_code, states, edges)
        return _go_sort_edges(edges), states

    if node_type in branch_types:
        taken = states.copy()
        alternatives = []
        reached_branch = False
        for child in root_node.children:
            if child.type in branch_types or reached_branch:
                # From the first nested branch on, every child is analysed
                # against the states that held before this statement.
                reached_branch = True
                sub_edges, branch_states = DFG_go(child, index_to_code, states)
                edges += sub_edges
                alternatives.append(branch_states)
            else:
                sub_edges, taken = DFG_go(child, index_to_code, taken)
                edges += sub_edges
        alternatives.append(taken)
        # The incoming states are always folded into the result, with or
        # without an else branch; duplicates are dropped by the join, which
        # is why no separate "has else" flag is needed.
        alternatives.append(states)
        return _go_sort_edges(edges), _go_join_states(alternatives)

    if node_type == 'for_statement':
        for child in root_node.children:
            sub_edges, states = DFG_go(child, index_to_code, states)
            edges += sub_edges
        # Second pass: re-run the clause's update part (if any) and then
        # every child that follows the for-clause.
        past_clause = False
        for child in root_node.children:
            if past_clause:
                sub_edges, states = DFG_go(child, index_to_code, states)
                edges += sub_edges
            elif child.type == "for_clause":
                update = child.child_by_field_name('update')
                if update is not None:
                    sub_edges, states = DFG_go(update, index_to_code, states)
                    edges += sub_edges
                past_clause = True
        return _go_merge_edges(edges), states

    # Any other node: visit "do first" children, then the remaining ones.
    ordered = [c for c in root_node.children if c.type in do_first_types]
    ordered += [c for c in root_node.children if c.type not in do_first_types]
    for child in ordered:
        sub_edges, states = DFG_go(child, index_to_code, states)
        edges += sub_edges
    return _go_sort_edges(edges), states
839
+
840
+
841
+
842
+
843
def _php_sort_edges(edges):
    """Return ``edges`` ordered by their token index (stable)."""
    return sorted(edges, key=lambda edge: edge[1])


def _php_merge_edges(edges):
    """Collapse edges sharing (code, index, relation) into a single edge.

    The first occurrence keeps its source lists as they are; later ones are
    unioned in (source codes de-duplicated, source indexes de-duplicated and
    sorted).  The result is ordered by token index.
    """
    merged = {}
    for code, idx, relation, src_codes, src_idxs in edges:
        key = (code, idx, relation)
        if key in merged:
            seen_codes, seen_idxs = merged[key]
            merged[key] = [list(set(seen_codes + src_codes)),
                           sorted(set(seen_idxs + src_idxs))]
        else:
            merged[key] = [src_codes, src_idxs]
    return _php_sort_edges(
        [key + (codes, idxs) for key, (codes, idxs) in merged.items()])


def _php_link(targets, sources, relation, index_to_code, states, edges):
    """Append one ``relation`` edge per (target, source) pair.

    Each target is also recorded in ``states`` as the latest definition of
    its token.  ``states`` and ``edges`` are updated in place.
    """
    for target in targets:
        t_idx, t_code = index_to_code[target]
        for source in sources:
            s_idx, s_code = index_to_code[source]
            edges.append((t_code, t_idx, relation, [s_code], [s_idx]))
        states[t_code] = [t_idx]


def _php_join_states(branches):
    """Union several state maps; each token gets a sorted, unique index list."""
    joined = {}
    for branch in branches:
        for token, idxs in branch.items():
            if token in joined:
                joined[token] += idxs
            else:
                joined[token] = idxs.copy()
    return {token: sorted(set(idxs)) for token, idxs in joined.items()}


def DFG_php(root_node, index_to_code, states):
    """Extract data-flow edges for the PHP syntax subtree at ``root_node``.

    Args:
        root_node: syntax-tree node exposing ``type``, ``children``,
            ``start_point``, ``end_point`` and ``child_by_field_name``.
        index_to_code: mapping from ``(start_point, end_point)`` to an
            ``(index, code)`` pair for each leaf token.
        states: mapping from token text to the list of indexes where that
            token was last defined.  A copy is taken before any update.

    Returns:
        ``(edges, states)``.  ``edges`` is a list of
        ``(code, index, relation, source_codes, source_indexes)`` tuples
        sorted by ``index``, where ``relation`` is ``'comesFrom'`` or
        ``'computedFrom'``; ``states`` is the updated definition map.
    """
    assignment_types = ('assignment_expression', 'augmented_assignment_expression')
    branch_types = ('if_statement', 'else_clause')
    do_first_types = ()

    states = states.copy()
    node_type = root_node.type
    edges = []

    is_token = len(root_node.children) == 0 or node_type in (
        'string_literal', 'string', 'character_literal')
    if is_token and node_type != 'comment':
        idx, code = index_to_code[(root_node.start_point, root_node.end_point)]
        if node_type == code:
            # Punctuation / keywords: the node type is the token text itself.
            return [], states
        if code in states:
            return [(code, idx, 'comesFrom', [code], states[code].copy())], states
        if node_type == 'identifier':
            states[code] = [idx]
        return [(code, idx, 'comesFrom', [], [])], states

    if node_type == 'simple_parameter':
        name = root_node.child_by_field_name('name')
        value = root_node.child_by_field_name('default_value')
        if value is None:
            for position in tree_to_variable_index(name, index_to_code):
                idx, code = index_to_code[position]
                edges.append((code, idx, 'comesFrom', [], []))
                states[code] = [idx]
        else:
            targets = tree_to_variable_index(name, index_to_code)
            sources = tree_to_variable_index(value, index_to_code)
            sub_edges, states = DFG_php(value, index_to_code, states)
            edges += sub_edges
            _php_link(targets, sources, 'comesFrom', index_to_code, states, edges)
        return _php_sort_edges(edges), states

    if node_type in assignment_types:
        left = root_node.child_by_field_name('left')
        right = root_node.child_by_field_name('right')
        sub_edges, states = DFG_php(right, index_to_code, states)
        edges += sub_edges
        targets = tree_to_variable_index(left, index_to_code)
        sources = tree_to_variable_index(right, index_to_code)
        _php_link(targets, sources, 'computedFrom', index_to_code, states, edges)
        return _php_sort_edges(edges), states

    if node_type == 'update_expression':
        operands = tree_to_variable_index(root_node, index_to_code)
        _php_link(operands, operands, 'computedFrom', index_to_code, states, edges)
        return _php_sort_edges(edges), states

    if node_type in branch_types:
        taken = states.copy()
        alternatives = []
        reached_branch = False
        for child in root_node.children:
            if child.type in branch_types or reached_branch:
                # From the first nested branch on, every child is analysed
                # against the states that held before this statement.
                reached_branch = True
                sub_edges, branch_states = DFG_php(child, index_to_code, states)
                edges += sub_edges
                alternatives.append(branch_states)
            else:
                sub_edges, taken = DFG_php(child, index_to_code, taken)
                edges += sub_edges
        alternatives.append(taken)
        # The incoming states are always folded into the result, whether or
        # not an else clause is present.
        alternatives.append(states)
        return _php_sort_edges(edges), _php_join_states(alternatives)

    if node_type == 'for_statement':
        for child in root_node.children:
            sub_edges, states = DFG_php(child, index_to_code, states)
            edges += sub_edges
        # Second pass: revisit everything after the first direct
        # assignment_expression child.
        past_init = False
        for child in root_node.children:
            if past_init:
                sub_edges, states = DFG_php(child, index_to_code, states)
                edges += sub_edges
            elif child.type == "assignment_expression":
                past_init = True
        return _php_merge_edges(edges), states

    if node_type == 'foreach_statement':
        # The first direct variable_name child is the iterated value, the
        # second one the loop variable.
        # NOTE(review): `name` stays None when fewer than two direct
        # variable_name children exist; the lookup below would then fail --
        # confirm how callers expect that case to be handled.
        name = None
        value = None
        for child in root_node.children:
            if child.type != 'variable_name':
                continue
            if value is None:
                value = child
            elif name is None:
                name = child
                break
        body = root_node.child_by_field_name('body')
        for _ in range(2):
            sub_edges, states = DFG_php(value, index_to_code, states)
            edges += sub_edges
            targets = tree_to_variable_index(name, index_to_code)
            sources = tree_to_variable_index(value, index_to_code)
            _php_link(targets, sources, 'computedFrom', index_to_code, states, edges)
            sub_edges, states = DFG_php(body, index_to_code, states)
            edges += sub_edges
        return _php_merge_edges(edges), states

    if node_type == 'while_statement':
        for _ in range(2):
            for child in root_node.children:
                sub_edges, states = DFG_php(child, index_to_code, states)
                edges += sub_edges
        return _php_merge_edges(edges), states

    # Any other node: visit "do first" children, then the remaining ones.
    ordered = [c for c in root_node.children if c.type in do_first_types]
    ordered += [c for c in root_node.children if c.type not in do_first_types]
    for child in ordered:
        sub_edges, states = DFG_php(child, index_to_code, states)
        edges += sub_edges
    return _php_sort_edges(edges), states
1027
+
1028
+
1029
def _merge_js_dfg_edges(edges):
    """Collapse edges that share (token, index, relation) into one edge.

    The first occurrence of a key keeps its parent lists untouched.  Later
    occurrences are merged in: parent names are de-duplicated in first-seen
    order (deterministic, unlike ``list(set(...))`` on strings, whose order
    depends on the hash seed) and parent indices are de-duplicated and sorted.

    :param edges: iterable of 5-tuples
        ``(token, index, relation, parent_names, parent_indices)``.
    :return: merged edges sorted by ``index``.
    """
    merged = {}
    for code, idx, relation, parent_codes, parent_idxs in edges:
        key = (code, idx, relation)
        if key not in merged:
            merged[key] = [parent_codes, parent_idxs]
        else:
            entry = merged[key]
            entry[0] = list(dict.fromkeys(entry[0] + parent_codes))
            entry[1] = sorted(set(entry[1] + parent_idxs))
    return sorted(
        ((key[0], key[1], key[2], val[0], val[1]) for key, val in merged.items()),
        key=lambda edge: edge[1],
    )


def DFG_javascript(root_node, index_to_code, states):
    """Recursively extract data-flow edges from a JavaScript syntax subtree.

    :param root_node: node exposing ``type``, ``children``, ``start_point``,
        ``end_point`` and ``child_by_field_name`` (presumably a tree-sitter
        node -- confirm against the caller).
    :param index_to_code: mapping from ``(start_point, end_point)`` to an
        ``(index, token_text)`` pair.
    :param states: mapping from token text to the list of indices it currently
        comes from.  It is copied on entry, so the caller's dict is not
        modified.
    :return: ``(edges, states)`` where ``edges`` is a list of
        ``(token, index, relation, parent_names, parent_indices)`` tuples
        sorted by ``index`` (``relation`` is ``'comesFrom'`` or
        ``'computedFrom'``) and ``states`` is the updated mapping.
    """
    assignment = ['assignment_pattern', 'augmented_assignment_expression']
    def_statement = ['variable_declarator']
    increment_statement = ['update_expression']
    if_statement = ['if_statement', 'else']
    for_statement = ['for_statement']
    while_statement = ['while_statement']
    do_first_statement = []
    states = states.copy()

    is_token = (
        len(root_node.children) == 0
        or root_node.type in ['string_literal', 'string', 'character_literal']
    ) and root_node.type != 'comment'

    if is_token:
        idx, code = index_to_code[(root_node.start_point, root_node.end_point)]
        if root_node.type == code:
            # Node type equals its own text (keywords, punctuation): no edge.
            return [], states
        elif code in states:
            return [(code, idx, 'comesFrom', [code], states[code].copy())], states
        else:
            if root_node.type == 'identifier':
                states[code] = [idx]
            return [(code, idx, 'comesFrom', [], [])], states
    elif root_node.type in def_statement:
        name = root_node.child_by_field_name('name')
        value = root_node.child_by_field_name('value')
        DFG = []
        if value is None:
            indexs = tree_to_variable_index(name, index_to_code)
            for index in indexs:
                idx, code = index_to_code[index]
                DFG.append((code, idx, 'comesFrom', [], []))
                states[code] = [idx]
            return sorted(DFG, key=lambda x: x[1]), states
        else:
            name_indexs = tree_to_variable_index(name, index_to_code)
            value_indexs = tree_to_variable_index(value, index_to_code)
            temp, states = DFG_javascript(value, index_to_code, states)
            DFG += temp
            for index1 in name_indexs:
                idx1, code1 = index_to_code[index1]
                for index2 in value_indexs:
                    idx2, code2 = index_to_code[index2]
                    DFG.append((code1, idx1, 'comesFrom', [code2], [idx2]))
                states[code1] = [idx1]
            return sorted(DFG, key=lambda x: x[1]), states
    elif root_node.type in assignment:
        left_nodes = root_node.child_by_field_name('left')
        right_nodes = root_node.child_by_field_name('right')
        DFG = []
        # The right-hand side is walked first so its reads see the old states.
        temp, states = DFG_javascript(right_nodes, index_to_code, states)
        DFG += temp
        name_indexs = tree_to_variable_index(left_nodes, index_to_code)
        value_indexs = tree_to_variable_index(right_nodes, index_to_code)
        for index1 in name_indexs:
            idx1, code1 = index_to_code[index1]
            for index2 in value_indexs:
                idx2, code2 = index_to_code[index2]
                DFG.append((code1, idx1, 'computedFrom', [code2], [idx2]))
            states[code1] = [idx1]
        return sorted(DFG, key=lambda x: x[1]), states
    elif root_node.type in increment_statement:
        DFG = []
        indexs = tree_to_variable_index(root_node, index_to_code)
        for index1 in indexs:
            idx1, code1 = index_to_code[index1]
            for index2 in indexs:
                idx2, code2 = index_to_code[index2]
                DFG.append((code1, idx1, 'computedFrom', [code2], [idx2]))
            states[code1] = [idx1]
        return sorted(DFG, key=lambda x: x[1]), states
    elif root_node.type in if_statement:
        DFG = []
        current_states = states.copy()
        others_states = []
        flag = False
        tag = False
        if 'else' in root_node.type:
            tag = True
        for child in root_node.children:
            if 'else' in child.type:
                tag = True
            if child.type not in if_statement and flag is False:
                temp, current_states = DFG_javascript(child, index_to_code, current_states)
                DFG += temp
            else:
                # From the first nested if/else onwards, every child is
                # analysed from the states that held before this statement.
                flag = True
                temp, new_states = DFG_javascript(child, index_to_code, states)
                DFG += temp
                others_states.append(new_states)
        others_states.append(current_states)
        if tag is False:
            # No else branch seen: the incoming states remain a possible outcome.
            others_states.append(states)
        new_states = {}
        for dic in others_states:
            for key in dic:
                if key not in new_states:
                    new_states[key] = dic[key].copy()
                else:
                    new_states[key] += dic[key]
        for key in states:
            if key not in new_states:
                new_states[key] = states[key]
            else:
                new_states[key] += states[key]
        for key in new_states:
            new_states[key] = sorted(list(set(new_states[key])))
        return sorted(DFG, key=lambda x: x[1]), new_states
    elif root_node.type in for_statement:
        DFG = []
        for child in root_node.children:
            temp, states = DFG_javascript(child, index_to_code, states)
            DFG += temp
        # Second pass over the children that follow a `variable_declaration`
        # child, using the states produced by the first pass.
        flag = False
        for child in root_node.children:
            if flag:
                temp, states = DFG_javascript(child, index_to_code, states)
                DFG += temp
            elif child.type == "variable_declaration":
                flag = True
        return _merge_js_dfg_edges(DFG), states
    elif root_node.type in while_statement:
        DFG = []
        # Children are walked twice; the second pass starts from the states
        # left by the first one.
        for _ in range(2):
            for child in root_node.children:
                temp, states = DFG_javascript(child, index_to_code, states)
                DFG += temp
        return _merge_js_dfg_edges(DFG), states
    else:
        DFG = []
        for child in root_node.children:
            if child.type in do_first_statement:
                temp, states = DFG_javascript(child, index_to_code, states)
                DFG += temp
        for child in root_node.children:
            if child.type not in do_first_statement:
                temp, states = DFG_javascript(child, index_to_code, states)
                DFG += temp

        return sorted(DFG, key=lambda x: x[1]), states
1182
+
1183
+
1184
+
parser/__init__.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ from .utils import (remove_comments_and_docstrings,
5
+ tree_to_token_index,
6
+ index_to_code_token,
7
+ tree_to_variable_index)
8
+ from .DFG import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
parser/build.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ from tree_sitter import Language, Parser
5
+
6
+ Language.build_library(
7
+ # Write the compiled library to `my-languages.so` (path is relative to the current working directory)
8
+ 'my-languages.so',
9
+
10
+ # Include one or more languages
11
+ [
12
+ #'tree-sitter-go',
13
+ #'tree-sitter-javascript',
14
+ 'tree-sitter-python',
15
+ #'tree-sitter-php',
16
+ #'tree-sitter-java',
17
+ #'tree-sitter-ruby',
18
+ #'tree-sitter-c-sharp',
19
+ #'tree-sitter-cpp'
20
+ ]
21
+ )
22
+
parser/build.sh ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ git clone https://github.com/tree-sitter/tree-sitter-go
2
+ git clone https://github.com/tree-sitter/tree-sitter-javascript
3
+ git clone https://github.com/tree-sitter/tree-sitter-python
4
+ git clone https://github.com/tree-sitter/tree-sitter-ruby
5
+ git clone https://github.com/tree-sitter/tree-sitter-php
6
+ git clone https://github.com/tree-sitter/tree-sitter-java
7
+ git clone https://github.com/tree-sitter/tree-sitter-c-sharp
8
+ python build.py
parser/tree-sitter-c-sharp ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 5b60f99545fea00a33bbfae5be956f684c4c69e2
parser/tree-sitter-cpp ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 5ead1e26c6ab71919db0f1880c46a278a93bc5ea
parser/tree-sitter-go ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 05900faa3cdb5d2d8c8bd5e77ee698487e0a8611
parser/tree-sitter-java ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 09d650def6cdf7f479f4b78f595e9ef5b58ce31e
parser/tree-sitter-javascript ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 936d976a782e75395d9b1c8c7c7bf4ba6fe0d86b
parser/tree-sitter-php ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit ab2e72179ceb8bb0b249c8ac9162a148e911b3dc
parser/tree-sitter-python ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit b14614e2144b8f9ee54deed5a24f3c6f51f9ffa8
parser/tree-sitter-ruby ADDED
@@ -0,0 +1 @@
 
 
1
+ Subproject commit 252ca18be76b0918fb6b34c302292b6931876c25
parser/utils.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ import re
5
+ from io import StringIO
6
+ import tokenize
7
# Matches, in priority order: `//` line comments, `/* ... */` block comments,
# single-quoted literals and double-quoted literals.  String literals are
# matched so that comment markers inside them are left alone.
_C_STYLE_COMMENT_OR_STRING = re.compile(
    r'//.*?$|/\*.*?\*/|\'(?:\\.|[^\\\'])*\'|"(?:\\.|[^\\"])*"',
    re.DOTALL | re.MULTILINE
)


def _drop_blank_lines(text):
    """Return ``text`` without lines that are empty or whitespace-only."""
    return '\n'.join(line for line in text.split('\n') if line.strip() != "")


def remove_comments_and_docstrings(source,lang):
    """Return ``source`` minus comments (and, for Python, docstrings).

    * ``lang == 'python'``: the source is re-assembled from ``tokenize``
      tokens, skipping comment tokens and string tokens that look like
      docstrings.  Errors raised by ``tokenize`` on invalid source propagate
      to the caller.
    * ``lang == 'ruby'``: the source is returned unchanged.
    * any other ``lang``: ``//`` and ``/* */`` comments are each replaced by a
      single space.

    In the Python and "other" cases, blank lines are removed from the result.

    :param source: source code text.
    :param lang: language name selecting the stripping strategy.
    :return: the stripped source text.
    """
    if lang in ['python']:
        pieces = []
        prev_toktype = tokenize.INDENT
        last_lineno = -1
        last_col = 0
        for tok in tokenize.generate_tokens(StringIO(source).readline):
            token_type = tok[0]
            token_string = tok[1]
            start_line, start_col = tok[2]
            end_line, end_col = tok[3]
            if start_line > last_lineno:
                last_col = 0
            # Re-create the horizontal gap between consecutive tokens.
            if start_col > last_col:
                pieces.append(" " * (start_col - last_col))
            if token_type == tokenize.COMMENT:
                # Comments are dropped.
                pass
            elif token_type == tokenize.STRING:
                # A string right after INDENT or NEWLINE, or one starting in
                # column 0, is treated as a docstring and dropped.
                if (
                    prev_toktype != tokenize.INDENT
                    and prev_toktype != tokenize.NEWLINE
                    and start_col > 0
                ):
                    pieces.append(token_string)
            else:
                pieces.append(token_string)
            prev_toktype = token_type
            last_col = end_col
            last_lineno = end_line
        return _drop_blank_lines("".join(pieces))
    elif lang in ['ruby']:
        return source
    else:
        def replacer(match):
            s = match.group(0)
            if s.startswith('/'):
                return " " # note: a space and not an empty string
            else:
                return s
        return _drop_blank_lines(
            re.sub(_C_STYLE_COMMENT_OR_STRING, replacer, source)
        )
65
+
66
def tree_to_token_index(root_node):
    """Collect the ``(start_point, end_point)`` span of every token under a node.

    A node counts as a token when it has no children or its type is one of
    ``string_literal`` / ``string`` / ``character_literal``; nodes of type
    ``comment`` never count.  Other nodes are expanded recursively, children
    in order.

    :param root_node: node exposing ``type``, ``children``, ``start_point``
        and ``end_point``.
    :return: list of ``(start_point, end_point)`` pairs.
    """
    string_like = root_node.type in ['string_literal','string','character_literal']
    childless = len(root_node.children) == 0
    if root_node.type != 'comment' and (childless or string_like):
        return [(root_node.start_point, root_node.end_point)]

    spans = []
    for child in root_node.children:
        spans.extend(tree_to_token_index(child))
    return spans
74
+
75
def tree_to_variable_index(root_node,index_to_code):
    """Collect token spans under a node whose text differs from the node type.

    Tokens are selected as in ``tree_to_token_index`` (childless or
    string-like nodes, never ``comment`` nodes).  A token whose text in
    ``index_to_code`` equals its node type (keywords, punctuation) is left
    out; every other token contributes its span.

    :param root_node: node exposing ``type``, ``children``, ``start_point``
        and ``end_point``.
    :param index_to_code: mapping from ``(start_point, end_point)`` to an
        ``(index, token_text)`` pair.
    :return: list of ``(start_point, end_point)`` pairs.
    """
    string_like = root_node.type in ['string_literal','string','character_literal']
    childless = len(root_node.children) == 0
    if root_node.type != 'comment' and (childless or string_like):
        span = (root_node.start_point, root_node.end_point)
        token_text = index_to_code[span][1]
        if root_node.type == token_text:
            return []
        return [(root_node.start_point, root_node.end_point)]

    spans = []
    for child in root_node.children:
        spans.extend(tree_to_variable_index(child, index_to_code))
    return spans
88
+
89
def index_to_code_token(index,code):
    """Return the text covered by a ``(start_point, end_point)`` span.

    :param index: pair ``(start_point, end_point)``; each point is indexed as
        ``(row, column)``.
    :param code: sequence of source lines, indexed by row.
    :return: the covered text.  For a span over several rows, the rest of the
        first row, every full row in between and the head of the last row are
        concatenated with no separator added.
    """
    start, end = index[0], index[1]
    if start[0] == end[0]:
        return code[start[0]][start[1]:end[1]]

    pieces = [code[start[0]][start[1]:]]
    for row in range(start[0] + 1, end[0]):
        pieces.append(code[row])
    pieces.append(code[end[0]][:end[1]])
    return "".join(pieces)
101
+
requirements.txt CHANGED
@@ -1 +1,2 @@
1
- git+https://github.com/huggingface/evaluate@main
 
 
1
+ git+https://github.com/huggingface/evaluate@main
2
+ tree_sitter
syntax_match.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) Microsoft Corporation.
2
+ # Licensed under the MIT license.
3
+
4
+ from parser import DFG_python,DFG_java,DFG_ruby,DFG_go,DFG_php,DFG_javascript,DFG_csharp
5
+ from parser import (remove_comments_and_docstrings,
6
+ tree_to_token_index,
7
+ index_to_code_token,
8
+ tree_to_variable_index)
9
+ from tree_sitter import Language, Parser
10
+
11
# Language name -> data-flow-graph extractor imported from `parser`.
# NOTE(review): nothing in the visible part of this module reads this table;
# confirm whether another module relies on it before removing it.
dfg_function={
    'python':DFG_python,
    'java':DFG_java,
    'ruby':DFG_ruby,
    'go':DFG_go,
    'php':DFG_php,
    'javascript':DFG_javascript,
    'c_sharp':DFG_csharp,
}
20
+
21
def calc_syntax_match(references, candidate, lang):
    """Score a single candidate against its references.

    Wraps the inputs into one-element lists and delegates to
    ``corpus_syntax_match``.

    :param references: the reference sources for this candidate.
    :param candidate: the candidate source.
    :param lang: language name passed through to ``corpus_syntax_match``.
    :return: whatever ``corpus_syntax_match`` returns for the one-sample corpus.
    """
    return corpus_syntax_match(
        references=[references], candidates=[candidate], lang=lang
    )
23
+
24
def _strip_comments_best_effort(code):
    """Return ``code`` with comments stripped, or unchanged if stripping fails."""
    try:
        # NOTE(review): comments are always stripped with the 'java' rules,
        # whatever `lang` the caller asked for.  For Python input that means
        # `#` comments stay and `//` is treated as a comment start.  Kept as-is
        # because changing it would alter reported scores -- confirm intent.
        return remove_comments_and_docstrings(code, 'java')
    except Exception:
        return code


def _collect_subtree_sexps(root_node):
    """Return the S-expression of the root and of every descendant with children."""
    sexps = []
    stack = [root_node]
    while stack:
        node = stack.pop()
        sexps.append(node.sexp())
        # Childless nodes are not counted as sub-trees of their own.
        stack.extend(child for child in node.children if len(child.children) != 0)
    return sexps


def corpus_syntax_match(references, candidates, lang):
    """Compute the fraction of reference sub-trees that occur in the candidates.

    For every (candidate, reference) pair both sources are comment-stripped
    and parsed; each reference sub-tree whose S-expression also appears among
    the candidate's sub-trees counts as a match.

    :param references: list with, per sample, a list of reference sources.
    :param candidates: list of candidate sources, aligned with ``references``.
    :param lang: grammar name to load from ``parser/my-languages.so``.
    :return: ``matched / total`` reference sub-trees over the whole corpus, or
        ``0.0`` when there is nothing to compare (previously this raised
        ``ZeroDivisionError``).
    """
    language = Language('parser/my-languages.so', lang)
    parser = Parser()
    parser.set_language(language)
    match_count = 0
    total_count = 0

    for i in range(len(candidates)):
        references_sample = references[i]
        candidate = candidates[i]
        for reference in references_sample:
            # The candidate is re-stripped for every reference, exactly as
            # before, so the text that gets parsed is unchanged.
            candidate = _strip_comments_best_effort(candidate)
            reference = _strip_comments_best_effort(reference)

            candidate_tree = parser.parse(bytes(candidate,'utf8')).root_node
            reference_tree = parser.parse(bytes(reference,'utf8')).root_node

            # A set gives O(1) membership tests; duplicates in the candidate
            # never mattered for the `in` check.
            cand_sexps = set(_collect_subtree_sexps(candidate_tree))
            ref_sexps = _collect_subtree_sexps(reference_tree)

            match_count += sum(1 for sub_tree in ref_sexps if sub_tree in cand_sexps)
            total_count += len(ref_sexps)

    if total_count == 0:
        return 0.0
    return match_count / total_count
weighted_ngram_match.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) Microsoft Corporation.
3
+ # Licensed under the MIT license.
4
+
5
+ # Natural Language Toolkit: BLEU Score
6
+ #
7
+ # Copyright (C) 2001-2020 NLTK Project
8
+ # Authors: Chin Yee Lee, Hengfeng Li, Ruxin Hou, Calvin Tanujaya Lim
9
+ # Contributors: Björn Mattsson, Dmitrijs Milajevs, Liling Tan
10
+ # URL: <http://nltk.org/>
11
+ # For license information, see LICENSE.TXT
12
+
13
+ """BLEU score implementation."""
14
+
15
+ import math
16
+ import sys
17
+ from fractions import Fraction
18
+ import warnings
19
+ from collections import Counter
20
+
21
+ from utils import ngrams
22
+ import pdb
23
+
24
+
25
def sentence_bleu(
    references,
    hypothesis,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Calculate the weighted n-gram match score for a single hypothesis.

    This is a thin wrapper that treats the sentence as a one-element corpus
    and delegates to ``corpus_bleu``; see that function for the scoring
    details.  The approach derives from BLEU (Papineni et al. 2002,
    http://www.aclweb.org/anthology/P02-1040.pdf).

    NOTE(review): ``modified_recall`` in this module indexes every reference
    as ``reference[0]`` (tokens) and ``reference[1]`` (token weights), so
    references presumably need to be ``(tokens, weights)`` pairs rather than
    plain token lists -- confirm against the callers.

    :param references: references for this hypothesis
    :param hypothesis: a hypothesis sentence
    :type hypothesis: list(str)
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function: smoothing callable, passed through unchanged
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The sentence-level score, as returned by ``corpus_bleu``.
    """
    return corpus_bleu(
        list_of_references=[references],
        hypotheses=[hypothesis],
        weights=weights,
        smoothing_function=smoothing_function,
        auto_reweigh=auto_reweigh,
    )
92
+
93
+
94
def corpus_bleu(
    list_of_references,
    hypotheses,
    weights=(0.25, 0.25, 0.25, 0.25),
    smoothing_function=None,
    auto_reweigh=False,
):
    """
    Calculate a single corpus-level weighted n-gram match score.

    The structure follows corpus-level BLEU (Papineni et al. 2002): the
    per-order numerators and denominators are summed over all
    hypothesis/reference pairs before dividing (micro-average) instead of
    averaging per-sentence scores.  Unlike plain BLEU, the per-order counts
    come from ``modified_recall``.

    NOTE(review): ``modified_recall`` indexes every reference as
    ``reference[0]`` (tokens) and ``reference[1]`` (token weights), so
    references presumably need to be ``(tokens, weights)`` pairs -- confirm
    against the callers.

    :param list_of_references: per hypothesis, the list of its references
    :param hypotheses: a list of hypothesis sentences
    :type hypotheses: list(list(str))
    :param weights: weights for unigrams, bigrams, trigrams and so on
    :type weights: list(float)
    :param smoothing_function: callable applied to the list of
        ``(numerator, denominator)`` pairs; defaults to
        ``SmoothingFunction().method1``
    :param auto_reweigh: Option to re-normalize the weights uniformly.
    :type auto_reweigh: bool
    :return: The corpus-level score; ``0`` when there is no unigram match.
    :raises AssertionError: if the two input lists differ in length.
    """
    p_numerators = Counter()  # Key = ngram order, and value = no. of ngram matches.
    p_denominators = Counter()  # Key = ngram order, and value = no. of ngram in ref.
    hyp_lengths, ref_lengths = 0, 0

    assert len(list_of_references) == len(hypotheses), (
        "The number of hypotheses and their reference(s) should be the " "same "
    )

    # Iterate through each hypothesis and their corresponding references.
    for references, hypothesis in zip(list_of_references, hypotheses):
        # For each order of ngram, accumulate the corpus-level numerator and
        # denominator.
        for i, _ in enumerate(weights, start=1):
            p_i_numerator, p_i_denominator = modified_recall(references, hypothesis, i)
            p_numerators[i] += p_i_numerator
            p_denominators[i] += p_i_denominator

        # Accumulate the hypothesis length and the closest reference length.
        hyp_len = len(hypothesis)
        hyp_lengths += hyp_len
        # NOTE(review): closest_ref_length() takes len() of each element of
        # `references`.  If the elements are (tokens, weights) pairs, that is
        # the pair length rather than the token count -- confirm whether the
        # brevity penalty is meant to work this way.
        ref_lengths += closest_ref_length(references, hyp_len)

    # Calculate corpus-level brevity penalty.
    bp = brevity_penalty(ref_lengths, hyp_lengths)

    # Uniformly re-weight when the total hypothesis length is below 4 and the
    # weights are the defaults.  A length of 0 is excluded: it used to raise
    # ZeroDivisionError here, and now falls through to the "no unigram match"
    # return below.
    if auto_reweigh:
        if 0 < hyp_lengths < 4 and weights == (0.25, 0.25, 0.25, 0.25):
            weights = (1 / hyp_lengths,) * hyp_lengths

    # Collects the (numerator, denominator) pairs for the different ngram orders.
    p_n = [
        (p_numerators[i], p_denominators[i])
        for i, _ in enumerate(weights, start=1)
    ]

    # Returns 0 if there's no matching n-grams
    # We only need to check for p_numerators[1] == 0, since if there's
    # no unigrams, there won't be any higher order ngrams.
    if p_numerators[1] == 0:
        return 0

    # If no smoothing function is given, use method1 from SmoothingFunction.
    if not smoothing_function:
        smoothing_function = SmoothingFunction().method1
    # `references` and `hypothesis` are the values left over from the last
    # iteration of the loop above.
    p_n = smoothing_function(
        p_n, references=references, hypothesis=hypothesis, hyp_len=hyp_lengths
    )
    s = (w_i * math.log(p_i[0]/p_i[1]) for w_i, p_i in zip(weights, p_n))
    s = bp * math.exp(math.fsum(s))
    return s
207
+
208
+
209
def _weighted_ngram_total(weights, counts):
    """Sum ``counts``, scaling each n-gram by the weight of its first token.

    Tokens that are missing from ``weights`` get a weight of 1.
    """
    return sum(
        count * (weights[ngram[0]] if ngram[0] in weights else 1)
        for ngram, count in counts.items()
    )


def modified_recall(references, hypothesis, n):
    """
    Calculate the numerator and denominator of the modified ngram recall.

    For every reference, each of its n-gram counts is clipped by the count of
    the same n-gram in the hypothesis.  The clipped counts feed the numerator
    and the reference's own counts feed the denominator (at least 1 per
    reference).  For unigrams, when the reference's weight table has as many
    entries as the reference has distinct unigrams, both sums are weighted by
    that table.

    :param references: A list of ``(tokens, weights)`` pairs; ``weights`` is
        looked up by token.
    :param hypothesis: A hypothesis translation.
    :type hypothesis: list(str)
    :param n: The ngram order.
    :type n: int
    :return: ``(numerator, denominator)`` summed over all references.
    :rtype: tuple
    """
    numerator = 0
    denominator = 0

    # Use an empty Counter when the hypothesis is shorter than n.
    counts = Counter(ngrams(hypothesis, n)) if len(hypothesis) >= n else Counter()
    for reference_and_weights in references:
        reference = reference_and_weights[0]
        weights = reference_and_weights[1]
        reference_counts = (
            Counter(ngrams(reference, n)) if len(reference) >= n else Counter()
        )
        clipped_counts = {
            ngram: min(count, counts[ngram]) for ngram, count in reference_counts.items()
        }
        if n == 1 and len(weights) == len(reference_counts):
            # Weighted unigram counts.
            numerator += _weighted_ngram_total(weights, clipped_counts)
            denominator += max(1, _weighted_ngram_total(weights, reference_counts))
        else:
            numerator += sum(clipped_counts.values())
            # At least 1 per reference, which avoids a zero denominator when
            # the reference is shorter than n.
            denominator += max(1, sum(reference_counts.values()))

    return numerator, denominator
269
+
270
+
271
def closest_ref_length(references, hyp_len):
    """
    Find the reference length that is closest to the hypothesis length.

    This is the *r* variable of the brevity penalty formula in Papineni
    et al. (2002).  The length of a reference is ``len(reference)``; when two
    lengths are equally close, the smaller one wins.

    :param references: An iterable of references.
    :param hyp_len: The length of the hypothesis.
    :type hyp_len: int
    :return: The length of the reference that's closest to the hypothesis.
    :rtype: int
    """
    def distance_then_length(ref_len):
        # Primary key: distance to the hypothesis; tie-break: shorter length.
        return (abs(ref_len - hyp_len), ref_len)

    return min(map(len, references), key=distance_then_length)
288
+
289
+
290
def brevity_penalty(closest_ref_len, hyp_len):
    """
    Calculate brevity penalty.

    A hypothesis that is longer than the closest reference is not penalized
    (``1``).  An empty hypothesis gets ``0``.  Otherwise the penalty is
    ``exp(1 - closest_ref_len / hyp_len)``, which is ``1.0`` when both lengths
    are equal.

    >>> brevity_penalty(28, 12)
    0.2635971381157267
    >>> brevity_penalty(12, 12)
    1.0

    :param closest_ref_len: The length of the closest reference for a single
        hypothesis OR the sum of all the closest references for every hypotheses.
    :type closest_ref_len: int
    :param hyp_len: The length of the hypothesis for a single sentence OR the
        sum of all the hypotheses' lengths for a corpus
    :type hyp_len: int
    :return: BLEU's brevity penalty.
    """
    if hyp_len > closest_ref_len:
        return 1
    if hyp_len == 0:
        # An empty hypothesis must drive the final score to 0.
        return 0
    length_ratio = closest_ref_len / hyp_len
    return math.exp(1 - length_ratio)
367
+
368
+
369
class SmoothingFunction:
    """
    Smoothing techniques for segment-level BLEU scores, as presented in
    Boxing Chen and Colin Cherry (2014) A Systematic Comparison of
    Smoothing Techniques for Sentence-Level BLEU. In WMT14.
    http://acl2014.org/acl2014/W14-33/pdf/W14-3346.pdf

    Every ``methodN`` receives ``p_n``, the list of modified n-gram
    precisions (index 0 holds the unigram precision), and returns the
    smoothed list.  A precision may be given either as a
    ``(numerator, denominator)`` pair or as a Fraction-like object exposing
    ``numerator`` / ``denominator``; each smoothed entry is returned in the
    same representation family it was given in (pair in -> pair out,
    Fraction-like in -> number out).
    """

    def __init__(self, epsilon=0.1, alpha=5, k=5):
        """
        Store the parameters required by the various smoothing techniques;
        the default values are set to the numbers used in the experiments
        from Chen and Cherry (2014).

        >>> hypothesis1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'which', 'ensures',
        ...                'that', 'the', 'military', 'always', 'obeys', 'the',
        ...                'commands', 'of', 'the', 'party']
        >>> reference1 = ['It', 'is', 'a', 'guide', 'to', 'action', 'that', 'ensures',
        ...               'that', 'the', 'military', 'will', 'forever', 'heed',
        ...               'Party', 'commands']
        >>> chencherry = SmoothingFunction()
        >>> print(sentence_bleu([reference1], hypothesis1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method0)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method1)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method2)) # doctest: +ELLIPSIS
        0.4489...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method3)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method4)) # doctest: +ELLIPSIS
        0.4118...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method5)) # doctest: +ELLIPSIS
        0.4905...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method6)) # doctest: +ELLIPSIS
        0.4135...
        >>> print(sentence_bleu([reference1], hypothesis1, smoothing_function=chencherry.method7)) # doctest: +ELLIPSIS
        0.4905...

        :param epsilon: the epsilon value used in method 1
        :type epsilon: float
        :param alpha: the alpha value used in method 6
        :type alpha: int
        :param k: the k value used in method 4
        :type k: int
        """
        self.epsilon = epsilon
        self.alpha = alpha
        self.k = k

    # ------------------------------------------------------------------
    # Private helpers: let every method handle both precision
    # representations (pairs and Fraction-like objects) uniformly.
    # ------------------------------------------------------------------

    @staticmethod
    def _is_scalar(p_i):
        """Return True when *p_i* is a plain number (Fraction-like, int or float) rather than a pair."""
        return hasattr(p_i, "numerator") or isinstance(p_i, float)

    @staticmethod
    def _counts(p_i):
        """Return ``(numerator, denominator)`` of *p_i* for either representation."""
        if hasattr(p_i, "numerator"):
            return p_i.numerator, p_i.denominator
        return p_i[0], p_i[1]

    @classmethod
    def _value(cls, p_i):
        """Return the numeric precision of *p_i* (pairs are divided out, numbers pass through)."""
        if cls._is_scalar(p_i):
            return p_i
        return p_i[0] / p_i[1]

    @classmethod
    def _rebuild(cls, template, numerator, denominator):
        """Return the smoothed precision in the same representation family as *template*."""
        if cls._is_scalar(template):
            return numerator / denominator
        return (numerator, denominator)

    # ------------------------------------------------------------------
    # Smoothing methods
    # ------------------------------------------------------------------

    def method0(self, p_n, *args, **kwargs):
        """
        No smoothing.

        Precisions with a non-zero match count are passed through unchanged.
        For each order with zero matches a warning is emitted and the entry
        is replaced by ``sys.float_info.min``.

        :param p_n: modified precisions, one per n-gram order
        :return: a new list; *p_n* itself is not modified
        """
        p_n_new = []
        for i, p_i in enumerate(p_n):
            if self._counts(p_i)[0] != 0:
                p_n_new.append(p_i)
                continue
            _msg = str(
                "\nThe hypothesis contains 0 counts of {}-gram overlaps.\n"
                "Therefore the BLEU score evaluates to 0, independently of\n"
                "how many N-gram overlaps of lower order it contains.\n"
                "Consider using lower n-gram order or use "
                "SmoothingFunction()"
            ).format(i + 1)
            warnings.warn(_msg)
            # A zero numerator means the precision is 0 (or undefined when
            # the denominator is 0 as well).  BLEU's geometric mean is
            # computed in log space, where log(0) is undefined, so the
            # smallest positive float stands in for zero.
            # NOTE(review): this entry is a bare float even when the other
            # entries are pairs -- confirm the caller copes with that.
            p_n_new.append(sys.float_info.min)
        return p_n_new

    def method1(self, p_n, *args, **kwargs):
        """
        Smoothing method 1: Add *epsilon* counts to precision with 0 counts.

        :param p_n: modified precisions, one per n-gram order
        :return: a new list in which only zero-count entries are replaced
        """
        smoothed = []
        for p_i in p_n:
            numerator, denominator = self._counts(p_i)
            if numerator == 0:
                p_i = self._rebuild(p_i, numerator + self.epsilon, denominator)
            smoothed.append(p_i)
        return smoothed

    def method2(self, p_n, *args, **kwargs):
        """
        Smoothing method 2: Add 1 to both numerator and denominator from
        Chin-Yew Lin and Franz Josef Och (2004) Automatic evaluation of
        machine translation quality using longest common subsequence and
        skip-bigram statistics. In ACL04.

        :param p_n: modified precisions, one per n-gram order
        :return: a new list with every entry smoothed
        """
        smoothed = []
        for p_i in p_n:
            numerator, denominator = self._counts(p_i)
            smoothed.append(self._rebuild(p_i, numerator + 1, denominator + 1))
        return smoothed

    def method3(self, p_n, *args, **kwargs):
        """
        Smoothing method 3: NIST geometric sequence smoothing
        The smoothing is computed by taking 1 / ( 2^k ), instead of 0, for each
        precision score whose matching n-gram count is null.
        k is 1 for the first 'n' value for which the n-gram match count is null.
        For example, if the text contains:
        - one 2-gram match
        - and (consequently) two 1-gram matches
        the n-gram count for each individual precision score would be:
        - n=1  =>  prec_count = 2     (two unigrams)
        - n=2  =>  prec_count = 1     (one bigram)
        - n=3  =>  prec_count = 1/2   (no trigram,  taking 'smoothed' value of 1 / ( 2^k ), with k=1)
        - n=4  =>  prec_count = 1/4   (no fourgram, taking 'smoothed' value of 1 / ( 2^k ), with k=2)

        :param p_n: modified precisions, one per n-gram order; updated in place
        :return: the same list object *p_n*
        """
        incvnt = 1  # From the mteval-v13a.pl, it's referred to as k.
        for i, p_i in enumerate(p_n):
            numerator, denominator = self._counts(p_i)
            if numerator == 0:
                p_n[i] = self._rebuild(p_i, 1 / 2 ** incvnt, denominator)
                incvnt += 1
        return p_n

    def method4(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 4:
        Shorter translations may have inflated precision values due to having
        smaller denominators; therefore, we give them proportionally
        smaller smoothed counts. Instead of scaling to 1/(2^k), Chen and Cherry
        suggests dividing by 1/ln(len(T)), where T is the length of the translation.

        :param p_n: modified precisions, one per n-gram order; updated in place
        :param references: unused by this method
        :param hypothesis: hypothesis tokens; only used for its length when
            *hyp_len* is not given
        :param hyp_len: hypothesis length; falls back to ``len(hypothesis)``
        :return: the same list object *p_n*
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        # ln(hyp_len) is the divisor below: it is 0 for a one-token
        # hypothesis (ZeroDivisionError) and undefined for an empty one, so
        # such inputs are returned without smoothing.
        if hyp_len <= 1:
            return p_n
        # Note that this K is different from the K from NIST.
        scale = self.k / math.log(hyp_len)
        for i, p_i in enumerate(p_n):
            numerator, denominator = self._counts(p_i)
            if numerator == 0:
                # NOTE(review): the smoothed count is i + k / ln(len), which
                # is what the original expression `i + 1 * k / ln(len)`
                # evaluates to under operator precedence; confirm that
                # (i + 1) * k / ln(len) was not the intended formula.
                p_n[i] = self._rebuild(p_i, i + scale, denominator)
        return p_n

    def method5(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 5:
        The matched counts for similar values of n should be similar. To
        calculate the n-gram matched count, it averages the n-1, n and n+1 gram
        matched counts.

        :param p_n: modified precisions, one per n-gram order; updated in place
        :param references: reference translations, forwarded to
            ``modified_precision``
        :param hypothesis: hypothesis tokens, forwarded to
            ``modified_precision``
        :param hyp_len: accepted for interface compatibility; unused
        :return: the same list object *p_n*; entries given as pairs come back
            as ``(smoothed_precision, 1)``
        """
        if not p_n:
            return p_n
        # Snapshot the unsmoothed values: each average needs the *original*
        # precision of the next order, not an already smoothed one.
        values = [self._value(p_i) for p_i in p_n]
        # Requires a precision value for one additional n-gram order, i.e.
        # the order right after the highest one present in p_n.
        values.append(
            self._value(modified_precision(references, hypothesis, len(p_n) + 1))
        )
        previous = values[0] + 1  # stands in for the non-existent "0-gram" term
        for i, p_i in enumerate(p_n):
            smoothed = (previous + values[i] + values[i + 1]) / 3
            p_n[i] = smoothed if self._is_scalar(p_i) else (smoothed, 1)
            previous = smoothed
        return p_n

    def method6(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 6:
        Interpolates the maximum likelihood estimate of the precision *p_n* with
        a prior estimate *pi0*. The prior is estimated by assuming that the ratio
        between pn and pn-1 will be the same as that between pn-1 and pn-2; from
        Gao and He (2013) Training MRF-Based Phrase Translation Models using
        Gradient Ascent. In NAACL.

        :param p_n: modified precisions, one per n-gram order; updated in place
        :param references: unused by this method
        :param hypothesis: hypothesis tokens, used to count its n-grams
        :param hyp_len: accepted for interface compatibility; unused
        :return: the same list object *p_n*
        :raises AssertionError: if the bigram precision is zero
        """
        # Only orders >= 3 are interpolated, so shorter inputs need no work.
        if len(p_n) < 3:
            return p_n
        # This smoothing only works when p_1 and p_2 are non-zero: the prior
        # for order n is built from the precisions of orders n-1 and n-2.
        # The check is on the bigram entry (index 1), matching the message.
        # Raised explicitly so the check is not stripped under ``python -O``.
        bigram = p_n[1]
        bigram_is_zero = (
            bigram == 0 if self._is_scalar(bigram) else bigram[0] == 0
        )
        if bigram_is_zero:
            raise AssertionError(
                "This smoothing method requires non-zero precision for bigrams."
            )
        for i in range(2, len(p_n)):  # Skips the first 2 orders of ngrams.
            p_i = p_n[i]
            two_back = self._value(p_n[i - 2])
            one_back = self._value(p_n[i - 1])
            pi0 = 0 if two_back == 0 else one_back ** 2 / two_back
            # No. of ngrams in translation that matches the reference.
            matched = self._counts(p_i)[0]
            # No. of ngrams in translation.
            total = sum(1 for _ in ngrams(hypothesis, i + 1))
            # Calculates the interpolated precision.
            p_n[i] = self._rebuild(
                p_i, matched + self.alpha * pi0, total + self.alpha
            )
        return p_n

    def method7(self, p_n, references, hypothesis, hyp_len=None, *args, **kwargs):
        """
        Smoothing method 7:
        Interpolates methods 4 and 5.

        :param p_n: modified precisions, one per n-gram order; updated in place
        :param references: reference translations, forwarded to both methods
        :param hypothesis: hypothesis tokens, forwarded to both methods
        :param hyp_len: hypothesis length; falls back to ``len(hypothesis)``
        :return: the list produced by ``method5`` applied to ``method4``'s output
        """
        hyp_len = hyp_len if hyp_len else len(hypothesis)
        p_n = self.method4(p_n, references, hypothesis, hyp_len)
        p_n = self.method5(p_n, references, hypothesis, hyp_len)
        return p_n