diff --git a/.devops/tools.sh b/.devops/tools.sh
index ece9e4efa..860a7e891 100755
--- a/.devops/tools.sh
+++ b/.devops/tools.sh
@@ -11,7 +11,7 @@ shift
 arg2="$@"
 
 if [[ $arg1 == '--convert' || $arg1 == '-c' ]]; then
-    python3 ./convert-pth-to-ggml.py $arg2
+    python3 ./convert.py $arg2
 elif [[ $arg1 == '--quantize' || $arg1 == '-q' ]]; then
     ./quantize $arg2
 elif [[ $arg1 == '--run' || $arg1 == '-r' ]]; then
@@ -32,7 +32,7 @@ else
     echo "  --run (-r): Run a model previously converted into ggml"
     echo "              ex: -m /models/7B/ggml-model-q4_0.bin -p \"Building a website can be done in 10 simple steps:\" -n 512"
     echo "  --convert (-c): Convert a llama model into ggml"
-    echo "                  ex: \"/models/7B/\" 1"
+    echo "                  ex: --outtype f16 \"/models/7B/\" "
     echo "  --quantize (-q): Optimize with quantization process ggml"
     echo "                   ex: \"/models/7B/ggml-model-f16.bin\" \"/models/7B/ggml-model-q4_0.bin\" 2"
     echo "  --all-in-one (-a): Execute --convert & --quantize"
diff --git a/README.md b/README.md
index 00571d8e1..aba22b9a0 100644
--- a/README.md
+++ b/README.md
@@ -310,6 +310,8 @@ Building the program with BLAS support may lead to some performance improvements
   ```
   Note: Because llama.cpp uses multiple CUDA streams for matrix multiplication results [are not guaranteed to be reproducible](https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility). If you need reproducibility, set `GGML_CUDA_MAX_STREAMS` in the file `ggml-cuda.cu` to 1.
 
+  The environment variable [`CUDA_VISIBLE_DEVICES`](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#env-vars) can be used to specify which GPU(s) will be used.
+
 - **CLBlast**
 
   OpenCL acceleration is provided by the matrix multiplication kernels from the [CLBlast](https://github.com/CNugteren/CLBlast) project and custom kernels for ggml that can generate tokens on the GPU.
@@ -348,7 +350,7 @@ Building the program with BLAS support may lead to some performance improvements
     cmake --install . --prefix /some/path
     ```
 
-  Where `/some/path` is where the built library will be installed (default is `/usr/loca`l`).
+  Where `/some/path` is where the built library will be installed (default is `/usr/local`).
 
   Building:
diff --git a/convert-pth-to-ggml.py b/convert-pth-to-ggml.py
index f87ac270c..dd15393c3 100644
--- a/convert-pth-to-ggml.py
+++ b/convert-pth-to-ggml.py
@@ -4,7 +4,9 @@ import argparse
 
 import convert
 
-parser = argparse.ArgumentParser(description='Convert a LLaMA model checkpoint to a ggml compatible file')
+parser = argparse.ArgumentParser(
+    description="""[DEPRECATED - use `convert.py` instead]
+    Convert a LLaMA model checkpoint to a ggml compatible file""")
 parser.add_argument('dir_model', help='directory containing the model checkpoint')
 parser.add_argument('ftype', help='file type (0: float32, 1: float16)', type=int, choices=[0, 1], default=1)
 args = parser.parse_args()
diff --git a/examples/main/main.cpp b/examples/main/main.cpp
index 6131f5b46..57cc1e486 100644
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -202,6 +202,13 @@ int main(int argc, char ** argv) {
         }
     }
 
+    // if we will use the cache for the full prompt without reaching the end of the cache, force
+    // reevaluation of the last token to recalculate the cached logits
+    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
+            session_tokens.size() > embd_inp.size()) {
+        session_tokens.resize(embd_inp.size() - 1);
+    }
+
     // number of tokens to keep when resetting context
     if (params.n_keep < 0 || params.n_keep > (int) embd_inp.size() || params.instruct) {
         params.n_keep = (int)embd_inp.size();
@@ -360,12 +367,6 @@ int main(int argc, char ** argv) {
                 }
             }
             if (i > 0) {
-                // check if we've used up all the prompt but not all cached tokens
-                if (embd.size() == i && n_session_consumed < (int) session_tokens.size()) {
-                    // force revaluation of the last token to recalculate logits
-                    i--;
-                    n_past--;
-                }
                 embd.erase(embd.begin(), embd.begin() + i);
             }
         }
diff --git a/ggml-opencl.cpp b/ggml-opencl.cpp
index ca3908763..c0253c5e6 100644
--- a/ggml-opencl.cpp
+++ b/ggml-opencl.cpp
@@ -717,6 +717,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size, CL_MEM_READ_WRITE); // dst
 
+
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
             const int i0 = i03*ne02 + i02;
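
Note on the examples/main/main.cpp hunk above: the rule it adds is easier to see in isolation. Below is a minimal standalone C++ sketch, not part of the patch; the token values and the prefix-matching loop are illustrative stand-ins for what main.cpp computes elsewhere. The point it demonstrates: when the cached session covers the whole prompt and extends beyond it, the cache is truncated to one token short of the prompt so that the final prompt token is re-evaluated and its logits are recomputed.

// Standalone sketch of the prompt-cache truncation rule from the main.cpp hunk above.
// The token values are made up; only the control flow mirrors the patch.
#include <cstdio>
#include <vector>

int main() {
    std::vector<int> embd_inp       = {1, 15043, 3186};        // prompt tokens (hypothetical)
    std::vector<int> session_tokens = {1, 15043, 3186, 29991}; // cached session extends past the prompt

    // count how many cached tokens match the prompt prefix
    size_t n_matching_session_tokens = 0;
    for (size_t i = 0; i < session_tokens.size() && i < embd_inp.size(); i++) {
        if (session_tokens[i] != embd_inp[i]) {
            break;
        }
        n_matching_session_tokens++;
    }

    // the rule added by the patch: if the cache covers the full prompt and goes past it,
    // drop the last matching token so its logits get recomputed on the next eval
    if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() &&
            session_tokens.size() > embd_inp.size()) {
        session_tokens.resize(embd_inp.size() - 1);
    }

    printf("tokens reused from cache: %zu (the last prompt token will be re-evaluated)\n",
           session_tokens.size());
    return 0;
}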