diff --git a/convert.py b/convert.py
index 8bb6c7e41..20e27aa42 100755
--- a/convert.py
+++ b/convert.py
@@ -947,6 +947,7 @@ class OutputFile:
             elapsed = time.time() - start
             size = ' x '.join(f"{dim:6d}" for dim in lazy_tensor.shape)
             padi = len(str(len(model)))
+            ndarray.byteswap(inplace=True)
             print(f"[{i+1:{padi}d}/{len(model)}] Writing tensor {name:38s} | size {size:16} | type {lazy_tensor.data_type.name:4} | T+{int(elapsed):4}")
             of.gguf.write_tensor_data(ndarray)
 
diff --git a/gguf-py/gguf/gguf.py b/gguf-py/gguf/gguf.py
index 598cf8e59..2e997f72a 100644
--- a/gguf-py/gguf/gguf.py
+++ b/gguf-py/gguf/gguf.py
@@ -22,6 +22,7 @@
 GGUF_MAGIC             = 0x46554747
 GGUF_VERSION           = 2
 GGUF_DEFAULT_ALIGNMENT = 32
+
 # general
 KEY_GENERAL_ARCHITECTURE         = "general.architecture"
 KEY_GENERAL_QUANTIZATION_VERSION = "general.quantization_version"
@@ -428,7 +429,6 @@ class GGMLQuantizationType(IntEnum):
     Q6_K = 14
     Q8_K = 15
 
-
 class GGUFValueType(IntEnum):
     UINT8   = 0
     INT8    = 1
@@ -483,10 +483,10 @@ class GGUFWriter:
         self.tensors = []
 
     def write_header_to_file(self):
-        self.fout.write(struct.pack("<I", GGUF_MAGIC))
-        self.fout.write(struct.pack("<I", GGUF_VERSION))
-        self.fout.write(struct.pack("<Q", self.ti_data_count))
-        self.fout.write(struct.pack("<Q", self.kv_data_count))
+        self.fout.write(struct.pack(">I", GGUF_MAGIC))
+        self.fout.write(struct.pack(">I", GGUF_VERSION))
+        self.fout.write(struct.pack(">Q", self.ti_data_count))
+        self.fout.write(struct.pack(">Q", self.kv_data_count))
         self.flush()
 #        print("tensors " + str(self.ti_data_count) + " kv " + str(self.kv_data_count))
@@ -559,16 +559,16 @@ class GGUFWriter:
         self.add_val(val, GGUFValueType.ARRAY)
 
     _simple_value_packing = {
-        GGUFValueType.UINT8:   "<B",
-        GGUFValueType.INT8:    "<b",
-        GGUFValueType.UINT16:  "<H",
-        GGUFValueType.INT16:   "<h",
-        GGUFValueType.UINT32:  "<I",
-        GGUFValueType.INT32:   "<i",
-        GGUFValueType.FLOAT32: "<f",
-        GGUFValueType.UINT64:  "<Q",
-        GGUFValueType.INT64:   "<q",
-        GGUFValueType.FLOAT64: "<d",
+        GGUFValueType.UINT8:   ">B",
+        GGUFValueType.INT8:    ">b",
+        GGUFValueType.UINT16:  ">H",
+        GGUFValueType.INT16:   ">h",
+        GGUFValueType.UINT32:  ">I",
+        GGUFValueType.INT32:   ">i",
+        GGUFValueType.FLOAT32: ">f",
+        GGUFValueType.UINT64:  ">Q",
+        GGUFValueType.INT64:   ">q",
+        GGUFValueType.FLOAT64: ">d",
         GGUFValueType.BOOL:    "?" ,
     }
 
     def add_val(self, val: Any, vtype: GGUFValueType | None = None, add_vtype: bool = True):
@@ -576,7 +576,7 @@ class GGUFWriter:
             vtype = GGUFValueType.get_type(val)
 
         if add_vtype:
-            self.kv_data += struct.pack("<I", vtype)
+            self.kv_data += struct.pack(">I", vtype)
             self.kv_data_count += 1
 
         pack_fmt = self._simple_value_packing.get(vtype)
@@ -584,14 +584,14 @@ class GGUFWriter:
             self.kv_data += struct.pack(pack_fmt, val)
         elif vtype == GGUFValueType.STRING:
             encoded_val = val.encode("utf8") if isinstance(val, str) else val
-            self.kv_data += struct.pack("<Q", len(encoded_val))
+            self.kv_data += struct.pack(">Q", len(encoded_val))
             self.kv_data += encoded_val
         elif vtype == GGUFValueType.ARRAY and isinstance(val, Sequence) and len(val) > 0:
             ltype = GGUFValueType.get_type(val[0])
             if not all(GGUFValueType.get_type(i) is ltype for i in val[1:]):
                 raise ValueError("All items in a GGUF array should be of the same type")
-            self.kv_data += struct.pack("<I", ltype)
-            self.kv_data += struct.pack("<Q", len(val))
+            self.kv_data += struct.pack(">I", ltype)
+            self.kv_data += struct.pack(">Q", len(val))
             for item in val:
                 self.add_val(item, add_vtype=False)
         else:
@@ -605,22 +605,23 @@ class GGUFWriter:
         assert raw_dtype is not None or tensor_dtype in (np.float32, np.float16), "Only F32 and F16 tensors are supported for now"
 
         encoded_name = name.encode("utf8")
-        self.ti_data += struct.pack("<Q", len(encoded_name))
+        self.ti_data += struct.pack(">Q", len(encoded_name))
         self.ti_data += encoded_name
         n_dims = len(tensor_shape)
-        self.ti_data += struct.pack("<I", n_dims)
+        self.ti_data += struct.pack(">I", n_dims)
         for i in range(n_dims):
-            self.ti_data += struct.pack("<Q", tensor_shape[n_dims - 1 - i])
+            self.ti_data += struct.pack(">Q", tensor_shape[n_dims - 1 - i])
         if raw_dtype is None:
             dtype = GGMLQuantizationType.F32 if tensor_dtype == np.float32 else GGMLQuantizationType.F16
         else:
             dtype = raw_dtype
-        self.ti_data += struct.pack("<I", dtype)
-        self.ti_data += struct.pack("<Q", self.offset_tensor)
+        self.ti_data += struct.pack(">I", dtype)
+        self.ti_data += struct.pack(">Q", self.offset_tensor)
         self.offset_tensor += GGUFWriter.ggml_pad(tensor_nbytes, self.data_alignment)
         self.ti_data_count += 1
 
     def add_tensor(self, name: str, tensor: np.ndarray[Any, Any], raw_shape: Sequence[int] | None = None, raw_dtype: GGMLQuantizationType | None = None):
+        tensor.byteswap(inplace=True)
         if self.use_temp_file and self.temp_file is None:
             fp = tempfile.SpooledTemporaryFile(mode="w+b", max_size=256*1024*1024)
             fp.seek(0)
diff --git a/tests/test-double-float.cpp b/tests/test-double-float.cpp
index b506f273f..afd7bf77f 100644
--- a/tests/test-double-float.cpp
+++ b/tests/test-double-float.cpp
@@ -4,7 +4,9 @@
 
 #undef NDEBUG
 #include <cassert>
+#if !defined(__riscv) && !defined(__s390__)
 #include <immintrin.h>
+#endif
 #include <cmath>
 #include <cstdint>
 #include <cstring>
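
For reference, a minimal standalone sketch (not part of the patch; names beyond GGUF_MAGIC are illustrative) of the two byte-order mechanisms the diff combines: struct.pack's endianness prefix for the GGUF header/metadata fields, and numpy's in-place byteswap() for the raw tensor payload.

    import struct
    import numpy as np

    GGUF_MAGIC = 0x46554747

    # "<" packs little-endian (GGUF's on-disk default), ">" packs big-endian.
    struct.pack("<I", GGUF_MAGIC)   # b'GGUF'
    struct.pack(">I", GGUF_MAGIC)   # b'FUGG' on disk; native order on s390x

    # byteswap(inplace=True) reverses the bytes of each element in the buffer,
    # so the tensor data written afterwards matches the big-endian metadata.
    tensor = np.arange(4, dtype=np.float32)
    tensor.byteswap(inplace=True)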