File size: 4,748 Bytes
e030ae6
 
9df4338
e030ae6
 
 
 
9df4338
e030ae6
 
 
9df4338
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c0cf912
9df4338
 
7517145
 
9df4338
e030ae6
 
 
 
9df4338
1f57142
 
bd89db0
1f57142
9df4338
e030ae6
 
 
 
 
 
445b401
1f57142
445b401
 
e030ae6
 
9df4338
e030ae6
c88b60d
2a2e9cb
9df4338
 
 
 
 
e030ae6
 
 
 
1f57142
e030ae6
 
 
5e02842
 
 
85054d0
 
 
861be40
85054d0
 
 
 
 
 
 
 
861be40
85054d0
861be40
c24bdb1
85054d0
 
e030ae6
 
 
be193e7
9df4338
e030ae6
9df4338
 
 
 
 
 
 
 
e030ae6
 
 
 
 
 
 
 
9df4338
 
 
 
 
861be40
 
 
e030ae6
 
 
9df4338
e030ae6
 
9df4338
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import torch
import transformers
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification
import gradio as gr
import os

model_name = 'eliolio/bart-finetuned-yelpreviews'
bert_model_name = 'eliolio/bert-correlation-yelpreviews'

access_token = os.environ.get('private_token')

model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name, use_auth_token=access_token
)
tokenizer = AutoTokenizer.from_pretrained(
    model_name, use_auth_token=access_token
)

bert_tokenizer = AutoTokenizer.from_pretrained(
    bert_model_name, use_auth_token=access_token
)
bert_model = AutoModelForSequenceClassification.from_pretrained(
    bert_model_name, use_auth_token=access_token
)


def correlation_score(table, review):
    # Compute the correlation score
    args = ((table, review))
    inputs = bert_tokenizer(*args, padding=True, max_length=128, truncation=True, return_tensors="pt")
    logits = bert_model(**inputs).logits
    probs = logits.softmax(dim=-1)
    return {
        "correlated": probs[:, 1].item(),
        "uncorrelated": probs[:, 0].item()
    }

def create_prompt(stars, useful, funny, cool):
    return f"Generate review: stars: {stars}, useful: {useful}, funny: {funny}, cool: {cool}"


def postprocess(review):
    dot = review.rfind('.')
    return review[:dot+1]


def generate_reviews(stars, useful, funny, cool):
    text = create_prompt(stars, useful, funny, cool)
    inputs = tokenizer(text, return_tensors='pt')
    out = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        do_sample=True,
        num_return_sequences=3,
        temperature=1.2,
        top_p=0.9
    )
    reviews = []
    scores = []
    for review in out:
        reviews.append(postprocess(tokenizer.decode(review, skip_special_tokens=True)))
    for review in reviews:
        scores.append(
            correlation_score(text[17:], review)
        )

    return reviews[0], reviews[1], reviews[2], scores[0], scores[1], scores[2]


css = """
    #ctr {text-align: center;}
    #btn {color: white; background: linear-gradient( 90deg,  rgba(255,166,0,1) 14.7%, rgba(255,99,97,1) 73% );}
"""


md_text = """<h1 style='text-align: center; margin-bottom: 1rem'>Generating Yelp reviews with BART-base ⭐⭐⭐</h1>

This space demonstrates how synthetic data generation can be performed on natural language columns, as found in the Yelp reviews dataset.   

| review id | stars | useful | funny | cool | text |
|:---:|:---:|:---:|:---:|:---:|:---:|
| 0 | 5 | 1 | 0 | 1 | "Wow! Yummy, different, delicious. Our favorite is the lamb curry and korma. With 10 different kinds of naan!!!  Don't let the outside deter you (because we almost changed our minds)...go in and try something new! You'll be glad you did!"




The model is a fine-tuned version of [facebook/bart-base](https://chatgptweb.us.kgm/facebook/bart-base) on Yelp reviews with the following input-output pairs:

- **Input**: "Generate review: stars: 5, useful: 1, funny: 0, cool: 1"
- **Output**: "Wow!  Yummy, different,  delicious.   Our favorite is the lamb curry and korma.  With 10 different kinds of naan!!!  Don't let the outside deter you (because we almost changed our minds)...go in and try something new!   You'll be glad you did!"
"""

resources = """## Resources
- Code for training: [github repo](https://github.com/EliottZemour/yelp-reviews/)
- The Yelp reviews dataset can be found in json format [here](https://www.yelp.com/dataset)."""

demo = gr.Blocks(css=css)
with demo:
    with gr.Row():
        gr.Markdown(md_text)

    with gr.Row():
        stars = gr.inputs.Slider(minimum=0, maximum=5,
                                 step=1, default=0, label="stars")
        useful = gr.inputs.Slider(
            minimum=0, maximum=5, step=1, default=0, label="useful")
        funny = gr.inputs.Slider(minimum=0, maximum=5,
                                 step=1, default=0, label="funny")
        cool = gr.inputs.Slider(minimum=0, maximum=5,
                                step=1, default=0, label="cool")
    with gr.Row():
        button = gr.Button("Generate reviews !", elem_id='btn')

    with gr.Row():
        output1 = gr.Textbox(label="Review #1")
        output2 = gr.Textbox(label="Review #2")
        output3 = gr.Textbox(label="Review #3")

    with gr.Row():
        score1 = gr.Label(label="Correlation score #1")
        score2 = gr.Label(label="Correlation score #2")
        score3 = gr.Label(label="Correlation score #3")

    with gr.Row():
        gr.Markdown(resources)

    button.click(
        fn=generate_reviews,
        inputs=[stars, useful, funny, cool],
        outputs=[output1, output2, output3, score1, score2, score3]
    )

demo.launch()